In [1]:
import sys
sys.path.insert(0, '/Users/matthewashman/github/MasterProject2018')

# Import necessary modules. Set settings. Import data.
import math
import numpy as np
import pandas as pd
import random
import pywt
import fastdtw
import matplotlib.pyplot as plt
from statsmodels.robust import mad
from tsfresh.feature_extraction import feature_calculators
from FeatureExtraction.feature_tools import detect_peaks
from IPython.display import display, clear_output, HTML
import re
import seaborn as sns
sns.set(style="whitegrid")

import pdb

plt.style.use('default')

X = pd.read_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/extracted_segments_with_labels_updated.pkl')

  from pandas.core import datetools


Remove bad labels

In [2]:
X['Label'].unique()

array(['0', nan, '1', '2', '-1'], dtype=object)

In [3]:
X_S1 = X[X['S1/S2']=='S1']
X_S2 = X[X['S1/S2']=='S2']
X_S2 = X_S2[(X_S2['Label']=='0') |(X_S2['Label']=='1') | (X_S2['Label']=='2')]

## Extract Three Groups of Features From Each Segment

In [22]:
temporal_feature_list = []
spectral_feature_list = []
hand_engineered_feature_list = []

temporal_feature_list_nonorm = []
spectral_feature_list_nonorm = []
hand_engineered_feature_list_nonorm = []

for i,row in X_S2.iterrows():
    clear_output(wait=True)
    display('Extracting Features: ' + str(round((i/X_S2.index[-1])*100, 3)) + '%')
    
    # Get the patients response to the first S1 stimuli as the reference response
    # Get typical response for this patient and channel
    # Bad apples
    if (((row['Type'] + row['Patient']) == 'af8') & (row['Channel'] == 'CS5-6')):
        typical_response = X_S1[(X_S1['Type']==row['Type']) & 
                           (X_S1['Patient']==row['Patient']) &
                           (X_S1['Channel']==row['Channel'])
                           ].sort_values(by=['Coupling Interval'], ascending=False).iloc[2]
    elif (((row['Type'] + row['Patient']) == 'at1') & (row['Channel'] == 'CS1-2')):
        typical_response = X_S1[(X_S1['Type']==row['Type']) & 
                           (X_S1['Patient']==row['Patient']) &
                           (X_S1['Channel']==row['Channel'])
                           ].sort_values(by=['Coupling Interval'], ascending=False).iloc[4]
    elif (((row['Type'] + row['Patient']) == 'avnrt10') & (row['Channel'] == 'CS1-2')):
        typical_response = X_S1[(X_S1['Type']==row['Type']) & 
                           (X_S1['Patient']==row['Patient']) &
                           (X_S1['Channel']==row['Channel'])
                           ].sort_values(by=['Coupling Interval'], ascending=False).iloc[1]
    elif (((row['Type'] + row['Patient']) == 'avrt13') & (row['Channel'] == 'CS1-2')):
        typical_response = X_S2[(X_S2['Type']==row['Type']) & 
                           (X_S2['Patient']==row['Patient']) &
                           (X_S2['Channel']==row['Channel'])
                           ].sort_values(by=['Coupling Interval'], ascending=False).iloc[0]
    elif (((row['Type'] + row['Patient']) == 'af14') & (row['Channel'] == 'CS1-2')):
        typical_response = X_S2[(X_S2['Type']==row['Type']) & 
                           (X_S2['Patient']==row['Patient']) &
                           (X_S2['Channel']==row['Channel'])
                           ].sort_values(by=['Coupling Interval'], ascending=False).iloc[0]
    else:
        typical_response = X_S1[(X_S1['Type']==row['Type']) & 
                               (X_S1['Patient']==row['Patient']) &
                               (X_S1['Channel']==row['Channel'])
                               ].sort_values(by=['Coupling Interval'], ascending=False).iloc[0]
        
    # Normalise amplitudes with respect to the typical response amplitude.
    s1_response = typical_response['Data']/max(abs(typical_response['Data']))
    s2_response = row['Data']/max(abs(typical_response['Data']))
        
    ref_temporal_feature_dict = get_temporal_feature_dict(s1_response, col_prefix = '')
    ref_spectral_feature_dict = get_spectral_feature_dict(s1_response, col_prefix = '')
    ref_hand_engineered_feature_dict = get_hand_engineered_feature_dict(s1_response, col_prefix = '')
    
    temporal_feature_dict = get_temporal_feature_dict(s2_response, col_prefix = '')
    spectral_feature_dict = get_spectral_feature_dict(s2_response, col_prefix = '')
    hand_engineered_feature_dict = get_hand_engineered_feature_dict(s2_response, col_prefix = '')
    
    temporal_feature_dict_nonorm = temporal_feature_dict.copy()
    spectral_feature_dict_nonorm = spectral_feature_dict.copy()
    hand_engineered_feature_dict_nonorm = hand_engineered_feature_dict.copy()

    
    for (k_t, v_t), (k_s, v_s), (k_he, v_he) in zip(temporal_feature_dict.items(), spectral_feature_dict.items(), hand_engineered_feature_dict.items()):
        temporal_feature_dict[k_t] = v_t - ref_temporal_feature_dict[k_t]
        spectral_feature_dict[k_s] = v_s - ref_spectral_feature_dict[k_s]
        hand_engineered_feature_dict[k_he] = v_he - ref_hand_engineered_feature_dict[k_he]
        
    temporal_feature_dict['Label'] = row['Label']
    temporal_feature_dict['Channel'] = row['Channel']
    spectral_feature_dict['Label'] = row['Label']
    spectral_feature_dict['Channel'] = row['Channel']
    hand_engineered_feature_dict['Label'] = row['Label']
    hand_engineered_feature_dict['Channel'] = row['Channel']
    
    temporal_feature_dict_nonorm['Label'] = row['Label']
    temporal_feature_dict_nonorm['Channel'] = row['Channel']
    spectral_feature_dict_nonorm['Label'] = row['Label']
    spectral_feature_dict_nonorm['Channel'] = row['Channel']
    hand_engineered_feature_dict_nonorm['Label'] = row['Label']
    hand_engineered_feature_dict_nonorm['Channel'] = row['Channel']
    
    temporal_feature_list.append(temporal_feature_dict)
    spectral_feature_list.append(spectral_feature_dict)
    hand_engineered_feature_list.append(hand_engineered_feature_dict)
    
    temporal_feature_list_nonorm.append(temporal_feature_dict_nonorm)
    spectral_feature_list_nonorm.append(spectral_feature_dict_nonorm)
    hand_engineered_feature_list_nonorm.append(hand_engineered_feature_dict_nonorm)

'Extracting Features: 100.0%'

In [23]:
temporal_features = pd.DataFrame(temporal_feature_list)
spectral_features = pd.DataFrame(spectral_feature_list)
hand_engineered_features = pd.DataFrame(hand_engineered_feature_list)

temporal_features_nonorm = pd.DataFrame(temporal_feature_list_nonorm)
spectral_features_nonorm = pd.DataFrame(spectral_feature_list_nonorm)
hand_engineered_features_nonorm = pd.DataFrame(hand_engineered_feature_list_nonorm)

In [24]:
temporal_features_nonorm.head()

Unnamed: 0,Channel,Label,Maximum Absolute Value,Ratio Beyond 1xSTD
0,CS1-2,0,1.338288,0.107692
1,CS3-4,0,1.510449,0.092308
2,CS5-6,0,1.449142,0.1
3,CS1-2,0,1.340258,0.107692
4,CS3-4,0,1.568009,0.092308


## Plotting Variations in Feature Values

In [25]:
%matplotlib qt
feature_names = temporal_features.drop(['Channel', 'Label'], axis=1).columns.tolist()

sns.set(style="whitegrid")
# Make plot for each feature
fig, axes = plt.subplots(ncols=2, nrows=len(feature_names), figsize=(16,9))
plt.rcParams.update({'font.size': 6})
for i,feature in enumerate(feature_names):
    axes[i,0] = sns.boxplot(x='Label', y=feature, palette='Set2', data=temporal_features, ax=axes[i,0])
    feature_ylabel = re.sub(r'(:)', r'\1\n', feature)
    axes[i,0].set_ylabel(ylabel=feature_ylabel, fontsize=8)
    axes[i,0].set(xlabel='')
    for p in (0, 1, 2):
        y = temporal_features[temporal_features['Label']==str(p)][feature].dropna()
        x = np.random.normal(p, 0.04, size=len(y))
        axes[i,0].plot(x, y, 'r.', alpha=0.2)
    
    axes[i,1] = sns.boxplot(x='Label', y=feature, palette='Set2', data=temporal_features_nonorm, ax=axes[i,1])
    feature_ylabel = re.sub(r'(:)', r'\1\n', feature)
    axes[i,1].set_ylabel(ylabel=feature_ylabel, fontsize=8)
    axes[i,1].set(xlabel='')
    for p in (0, 1, 2):
        y = temporal_features_nonorm[temporal_features_nonorm['Label']==str(p)][feature].dropna()
        x = np.random.normal(p, 0.04, size=len(y))
        axes[i,1].plot(x, y, 'r.', alpha=0.2)
        
plt.show()

In [27]:
%matplotlib qt
feature_names = spectral_features.drop(['Channel', 'Label'], axis=1).columns.tolist()

sns.set(style="whitegrid")
# Make plot for each feature
fig, axes = plt.subplots(ncols=2, nrows=len(feature_names), figsize=(16,9))
plt.rcParams.update({'font.size': 6})
for i,feature in enumerate(feature_names):
    axes[i,0] = sns.boxplot(x='Label', y=feature, palette='Set2', data=spectral_features, ax=axes[i,0])
    feature_ylabel = re.sub(r'(:)', r'\1\n', feature)
    axes[i,0].set_ylabel(ylabel=feature_ylabel, fontsize=8)
    axes[i,0].set(xlabel='')
    for p in (0, 1, 2):
        y = spectral_features[spectral_features['Label']==str(p)][feature].dropna()
        x = np.random.normal(p, 0.04, size=len(y))
        axes[i,0].plot(x, y, 'r.', alpha=0.2)
    
    axes[i,1] = sns.boxplot(x='Label', y=feature, palette='Set2', data=spectral_features_nonorm, ax=axes[i,1])
    feature_ylabel = re.sub(r'(:)', r'\1\n', feature)
    axes[i,1].set_ylabel(ylabel=feature_ylabel, fontsize=8)
    axes[i,1].set(xlabel='')
    for p in (0, 1, 2):
        y = spectral_features_nonorm[spectral_features_nonorm['Label']==str(p)][feature].dropna()
        x = np.random.normal(p, 0.04, size=len(y))
        axes[i,1].plot(x, y, 'r.', alpha=0.2)
        
plt.show()

In [34]:
%matplotlib qt
feature_names = hand_engineered_features.drop(['Channel', 'Label'], axis=1).columns.tolist()

sns.set(style="whitegrid")
# Make plot for each feature
plt.rcParams.update({'font.size': 6})
for i,feature in enumerate(feature_names):
    fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(16,9))
    axes[0] = sns.boxplot(x='Label', y=feature, palette='Set2', data=hand_engineered_features, ax=axes[0])
    feature_ylabel = re.sub(r'(:)', r'\1\n', feature)
    axes[0].set_ylabel(ylabel=feature_ylabel, fontsize=8)
    axes[0].set(xlabel='')
    for p in (0, 1, 2):
        y = hand_engineered_features[hand_engineered_features['Label']==str(p)][feature].dropna()
        x = np.random.normal(p, 0.04, size=len(y))
        axes[0].plot(x, y, 'r.', alpha=0.2)
    
    axes[1] = sns.boxplot(x='Label', y=feature, palette='Set2', data=hand_engineered_features_nonorm, ax=axes[1])
    feature_ylabel = re.sub(r'(:)', r'\1\n', feature)
    axes[1].set_ylabel(ylabel=feature_ylabel, fontsize=8)
    axes[1].set(xlabel='')
    for p in (0, 1, 2):
        y = hand_engineered_features_nonorm[hand_engineered_features_nonorm['Label']==str(p)][feature].dropna()
        x = np.random.normal(p, 0.04, size=len(y))
        axes[1].plot(x, y, 'r.', alpha=0.2)
        
    plt.draw()
    plt.waitforbuttonpress()
    plt.close()

## Correlation Between Features and Label

In [9]:
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat)+1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0,cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array,n_array))/np.sum(n_array)
    numerator = np.sum(np.multiply(n_array,np.power(np.subtract(y_avg_array,y_total_avg),2)))
    denominator = np.sum(np.power(np.subtract(measurements,y_total_avg),2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = numerator/denominator
    return eta

In [10]:
feature_names = temporal_features.drop(['Channel', 'Label'], axis=1).columns
print('Temporal Features\n')
for feature in feature_names:
    label = (temporal_features['Label']=='2').astype(int)
    feature_values = temporal_features[feature].values
    eta = correlation_ratio(label, feature_values)
    print(feature + ': ')
    print(eta)

print('\n Spectral Features\n')
feature_names = spectral_features.drop(['Channel', 'Label'], axis=1).columns
for feature in feature_names:
    label = (spectral_features['Label']=='2').astype(int)
    feature_values = spectral_features[feature].values
    eta = correlation_ratio(label, feature_values)
    print(feature + ': ')
    print(eta)
    
print('\n Hand Engineered Features\n')
feature_names = hand_engineered_features.drop(['Channel', 'Label'], axis=1).columns
for feature in feature_names:
    label = (hand_engineered_features['Label']=='2').astype(int)
    feature_values = hand_engineered_features[feature].values
    eta = correlation_ratio(label, feature_values)
    print(feature + ': ')
    print(eta)
    
print('\n DTW Distance')
feature_values = dtw_df['DTW Distance'].values
eta = correlation_ratio(label, feature_values)
print(eta)

Temporal Features

Approximate Entropy: m=3 r=0.7: 
0.23531762597696576
Energy Ratio by Chunks: num_segments=10 segment_focus=2: 
0.006561813976145857
Energy Ratio by Chunks: num_segments=10 segment_focus=3: 
0.022128593397454745
Index Mass Quantile: q=0.4: 
0.12484643817214154
Index Mass Quantile: q=0.6: 
0.14200936919564663
Maximum Absolute Value: 
0.012078767067050808
Ratio Beyond 1xSTD: 
0.14106703968612852

 Spectral Features

Power Spectral Entropy: 
nan
Power Spectral Entropy Around Maximum Energy: width=60: 
0.004669478111697543
Spectral Centroid: 
0.0017868944358547217
Spectral Centroid Around Maximum Energy: width=60: 
0.0013636814429217

 Hand Engineered Features

Conduction Delay: set_thresh=False: 
0.08127395732253252
Energy Around Max Energy: 
0.0008168402358414893
Location of Maximum Energy: M=14: 
0.12415559866072236
Number of Peaks: set_thresh=False: 
0.24110149079797785
Number of Peaks: set_thresh=True: 
0.10215937669754108
Peaks Between Min and Max: 
0.03791395094971

## Boxplots of Selected Features

In [None]:
%matplotlib qt
# fig = plt.figure(figsize=(8,4))
# ax = sns.boxplot(x='Label', y='Ratio Beyond 1xSTD', palette=['g', 'orange', 'r'], data=temporal_features)
# feature_ylabel = re.sub(r'(:)', r'\1\n', 'Ratio Above 1xSTD')
# # ax.set_ylabel(ylabel=feature_ylabel, fontsize=8)
# ax.set(xlabel='')
# ax.set(xticks=[])
# ax.set(ylabel='')
# for p in (0, 1, 2):
#     y = temporal_features[temporal_features['Label']==str(p)]['Ratio Beyond 1xSTD'].dropna()
#     x = np.random.normal(p, 0.04, size=len(y))
#     ax.plot(x, y, 'k.', alpha=0.2)
    
# plt.title('Ratio Above $\sigma$', fontsize=16)
# plt.yticks(fontsize=14)
# plt.draw()
# plt.waitforbuttonpress()
# plt.close()

# feature_names = hand_engineered_features.drop(['Channel', 'Label'], axis=1).columns
# feature_titles = ('Conduction Delay', 'Energy Around Max Energy', 'Location of Maximum Energy', 
#                   'Number of Peaks', 'Number of Peaks Bad', 'Peaks Between Min/Max', 'Percentage Fractionation',
#                  'Sample Entropy', 'Width of Max Energy')
# for feature, feature_title in zip(feature_names, feature_titles):
#     fig = plt.figure(figsize=(8,4))
#     ax = sns.boxplot(x='Label', y=feature, palette=['g', 'orange', 'r'], data=hand_engineered_features)
#     ax.set(xlabel='')
#     ax.set(xticks=[])
#     ax.set(ylabel='')
#     for p in (0, 1, 2):
#         y = hand_engineered_features[hand_engineered_features['Label']==str(p)][feature].dropna()
#         x = np.random.normal(p, 0.04, size=len(y))
#         ax.plot(x, y, 'k.', alpha=0.2)

#     plt.title(feature_title, fontsize=16)
#     plt.yticks(fontsize=14)
#     plt.draw()
#     plt.waitforbuttonpress()
#     plt.close()
    
fig = plt.figure(figsize=(8,4))
ax = sns.boxplot(x='Label', y='DTW Distance', palette=['g', 'orange', 'r'], data=dtw_df)
feature_ylabel = re.sub(r'(:)', r'\1\n', 'Ratio Above 1xSTD')
# ax.set_ylabel(ylabel=feature_ylabel, fontsize=8)
ax.set(xlabel='')
ax.set(xticks=[])
ax.set(ylabel='')
for p in (0, 1, 2):
    y = dtw_df[dtw_df['Label']==str(p)]['DTW Distance'].dropna()
    x = np.random.normal(p, 0.04, size=len(y))
    ax.plot(x, y, 'k.', alpha=0.2)
    
plt.title('DTW Distance', fontsize=16)
plt.yticks(fontsize=14)
plt.draw()
plt.waitforbuttonpress()
plt.close()

fig = plt.figure(figsize=(8,4))
ax = sns.boxplot(x='Label', y='Normalised Mean', palette=['g', 'orange', 'r'], data=hand_picked_features)
# ax.set_ylabel(ylabel=feature_ylabel, fontsize=8)
ax.set(xlabel='')
ax.set(xticks=[])
ax.set(ylabel='')
for p in (0, 1, 2):
    y = hand_picked_features[hand_picked_features['Label']==str(p)]['Normalised Mean'].dropna()
    x = np.random.normal(p, 0.04, size=len(y))
    ax.plot(x, y, 'k.', alpha=0.2)
    
plt.title('Mean Absolute Value', fontsize=16)
plt.yticks(fontsize=14)
plt.draw()
plt.waitforbuttonpress()
plt.close()

## Looking at Specific Features

In [41]:
hand_picked_feature_list = []
dtw_list = []

for i,row in X_S2.iterrows():
    clear_output(wait=True)
    display('Extracting Features: ' + str(round((i/X_S2.index[-1])*100, 3)) + '%')
    
    # Get the patients response to the first S1 stimuli as the reference response
    # Get typical response for this patient and channel
    # Bad apples
    if (((row['Type'] + row['Patient']) == 'af8') & (row['Channel'] == 'CS5-6')):
        typical_response = X_S1[(X_S1['Type']==row['Type']) & 
                           (X_S1['Patient']==row['Patient']) &
                           (X_S1['Channel']==row['Channel'])
                           ].sort_values(by=['Coupling Interval'], ascending=False).iloc[2]
    elif (((row['Type'] + row['Patient']) == 'at1') & (row['Channel'] == 'CS1-2')):
        typical_response = X_S1[(X_S1['Type']==row['Type']) & 
                           (X_S1['Patient']==row['Patient']) &
                           (X_S1['Channel']==row['Channel'])
                           ].sort_values(by=['Coupling Interval'], ascending=False).iloc[4]
    else:
        typical_response = X_S1[(X_S1['Type']==row['Type']) & 
                               (X_S1['Patient']==row['Patient']) &
                               (X_S1['Channel']==row['Channel'])
                               ].sort_values(by=['Coupling Interval'], ascending=False).iloc[0]
        
    # Normalise amplitudes with respect to the typical response amplitude.
    s1_response = typical_response['Data']/max(abs(typical_response['Data']))
    s2_response = row['Data']/max(abs(typical_response['Data']))
    
    ref_hand_picked_feature_dict = get_hand_picked_feature_dict(s1_response, col_prefix = '')
    hand_picked_feature_dict = get_hand_picked_feature_dict(s2_response, col_prefix = '')

    
    for k, v in hand_picked_feature_dict.items():
        hand_picked_feature_dict[k] = v - ref_hand_picked_feature_dict[k]
        
    hand_picked_feature_dict['Label'] = row['Label']
    hand_picked_feature_dict['Channel'] = row['Channel']
    
#     fdtw = fastdtw.dtw(s2_response, s1_response)
#     dtw_dict = {}
#     dtw_dict['DTW Distance'] = fdtw[0]
#     dtw_dict['Label'] = row['Label']
#     dtw_dict['Channel'] = row['Channel']
    
    hand_picked_feature_list.append(hand_picked_feature_dict)
#     dtw_list.append(dtw_dict)

'Extracting Features: 100.0%'

In [42]:
hand_picked_features = pd.DataFrame(hand_picked_feature_list)
# dtw_df = pd.DataFrame(dtw_list)

In [43]:
%matplotlib qt
feature_names = hand_picked_features.drop(['Channel', 'Label'], axis=1).columns
feature_names = feature_names.tolist()
feature_names.append(feature_names[0])
sns.set(style="whitegrid")
# Make plot for each feature
fig, axes = plt.subplots(ncols=2, nrows=math.ceil(len(feature_names)/2), figsize=(16,9))
plt.rcParams.update({'font.size': 6})
for i,feature in enumerate(feature_names):
    if (i < math.ceil(len(feature_names)/2)):
        axes[i,0] = sns.boxplot(x='Label', y=feature, palette='Set2', data=hand_picked_features, ax=axes[i,0])
        feature_ylabel = re.sub(r'(:)', r'\1\n', feature)
        axes[i,0].set_ylabel(ylabel=feature_ylabel, fontsize=8)
        axes[i,0].set(xlabel='')
        for p in (0, 1, 2):
            y = hand_picked_features[hand_picked_features['Label']==str(p)][feature].dropna()
            x = np.random.normal(p, 0.04, size=len(y))
            axes[i,0].plot(x, y, 'r.', alpha=0.2)
    else:
        j = i - math.ceil(len(feature_names)/2)
        axes[j,1] = sns.boxplot(x='Label', y=feature, palette='Set2', data=hand_picked_features, ax=axes[j,1])
        feature_ylabel = re.sub(r'(:)', r'\1\n', feature)
        axes[j,1].set_ylabel(ylabel=feature_ylabel, fontsize=8)
        axes[j,1].set(xlabel='')
        for p in (0, 1, 2):
            y = hand_picked_features[hand_picked_features['Label']==str(p)][feature].dropna()
            x = np.random.normal(p, 0.04, size=len(y))
            axes[j,1].plot(x, y, 'r.', alpha=0.2)
        
plt.show()

In [44]:
print('\n Hand Picked Features\n')
feature_names = hand_picked_features.drop(['Channel', 'Label'], axis=1).columns
for feature in feature_names:
    label = (hand_picked_features['Label']=='2').astype(int)
    feature_values = hand_picked_features[feature].values
    eta = correlation_ratio(label, feature_values)
    print(feature + ': ')
    print(eta)


 Hand Picked Features

Mean: 
0.006447794931599672
Normalised Mean: 
0.09547942250217417


In [22]:
%matplotlib qt

fig = plt.figure(figsize=(16,9))
sns.boxplot(x='Label', y='DTW Distance', palette='Set2', data=dtw_df)
ylabel = re.sub(r'(:)', r'\1\n', 'DTW Distance')
plt.ylabel(ylabel, fontsize=8)
plt.xlabel('')
for p in (0, 1, 2):
    y = dtw_df[dtw_df['Label']==str(p)]['DTW Distance'].dropna()
    x = np.random.normal(p, 0.04, size=len(y))
    plt.plot(x, y, 'r.', alpha=0.2)
    
plt.show()

## Feature Implementations

In [4]:
# A shitty conduction delay detector
def get_delay(x, amp_thresh=None, set_thresh=False):
    if (set_thresh==True):
        if any(abs(x)>amp_thresh):
            return np.argmax(abs(x)>amp_thresh)
        else:
            return len(x)
    else:    
        return np.argmax(abs(x)>(max(abs(x))/2))
    
def denoise(x):
    # Obtain Daubechies N=6 wavelet coefficients
    waveletCoefs = pywt.wavedec(x, 'db7', mode='per')

    # Throw away coefficients corresponding to noise
    sigma = mad(waveletCoefs[-1])
    uThresh = 1*sigma*np.sqrt(2*np.log(len(x)))
    denoised = waveletCoefs[:]
    denoised[1:] = (pywt._thresholding.hard(i, value=uThresh) for i in denoised[1:])

    # Reconstruct the original signal
    xDenoised = pywt.waverec(denoised, 'db7', mode='per')

    return xDenoised

def get_peaks(x, height_thresh, scale_amp=None, set_scale=False, plot = False):
    x = np.array(x)
    
    # Get height_thresh
    if set_scale:
        height_thresh = height_thresh*scale_amp
    else:
        height_thresh = height_thresh*max(abs(x))
    
    # Denoise x
    xdn = denoise(x)

    # Detect peaks using detect_peaks
    pos_peak_idx = detect_peaks(xdn, mph=height_thresh, threshold = 0)
    neg_peak_idx = detect_peaks((-xdn), mph=height_thresh, threshold = 0)
    peak_idx = np.concatenate([pos_peak_idx, neg_peak_idx])
    peak_idx = np.sort(peak_idx)
    # Edge indeces aren't detected
    peak_idx = peak_idx[(peak_idx != 0) & (peak_idx != (len(xdn)-1))]

    new_peak_idx = []
    peak_amp = []
    if (len(peak_idx) > 0):
        new_peak_idx.append(peak_idx[0])
        mp_thresh = 0.2*max(abs(x))
        for i in range(len(peak_idx)-1):
            idx = peak_idx[i]
            idx_next = peak_idx[i+1]
            mid_point = int((idx_next+idx)/2)
            if (max([abs(x[idx_next]-x[mid_point]), abs(x[idx]-x[mid_point])]) > mp_thresh):
                new_peak_idx.append(idx_next)

        peak_idx = np.array(new_peak_idx)
        peak_amp = x[peak_idx]

    if plot == True:
        fig, [ax1] = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(8,8))
        ax1.plot(x, 'b' , xdn, 'r--', peak_idx, peak_amp, 'kx')
        #plt.title(fileName)
        ax1.set_xlabel('Sample')
        ax1.set_ylabel('Normalised amplitude')
        ax1.legend(['Original segment', 'Denoised segment', 'Detected peaks'])

        plt.draw()
        plt.waitforbuttonpress(0) # this will wait for indefinite time
        plt.close(fig)


    return peak_idx, peak_amp

def sample_entropy(U, m, r):

    def _maxdist(x_i, x_j):
        result = max([abs(ua-va) for ua, va in zip(x_i, x_j)])
        return result

    def _phi(m):
        x = np.zeros([N,m-1])
        for i in range(N-m+1):
            x[i,:] = U[i:i+m-1]

        C = 0
        for i in range(len(x)):
            for j in range(len(x)):
                if i != j:
                    if _maxdist(x[i,:], x[j,:]) <= r:
                        C = C + 1

        return C

    U = U/max(abs(U))
    N = len(U)

    return -np.log(_phi(m+1)/_phi(m))

def percentage_fractionation(x, peak_idxs, thresh=0.01, sr=1000):
    # Get peak indexes and amplitude
    peak_idx_diffs = np.diff(peak_idxs)
    frac_time = 0
    frac_time = np.sum(peak_idx_diffs[peak_idx_diffs < thresh*sr])
    prcnt_frac = (frac_time/len(x))*100
    return prcnt_frac

def get_local_sample_entropy(x, centre_idx, width, m=2, r=0.05):
    # Ensure width is odd
    if ((width%2) == 0):
        width += 1
        
    if (centre_idx < (width-1)/2):
        return sample_entropy(x[:width+1], m, r)
    elif (centre_idx > (len(x)-1-(width-1)/2)):
        return sample_entropy(x[len(x)-1-width:], m, r)
    else:
        return sample_entropy(x[int(centre_idx-(width-1)/2):int(centre_idx+(width+1)/2)], m, r)
    
def get_location_of_max_energy(x, M=14):
    v = np.ones(M)
    x_ = np.convolve(abs(x), v)
    return (np.argmax(x_) + math.floor(M/2))
        
def get_local_peaks(x, centre_idx, width=25, height_thresh=0.1):
    if ((width%2) == 0):
        width += 1
        
    if (centre_idx < (width-1)/2):
        return get_peaks(x[:width+1], height_thresh)
    elif (centre_idx > (len(x)-1-(width-1)/2)):
        return get_peaks(x[len(x)-1-width:], height_thresh)
    else:
        return get_peaks(x[int(centre_idx-(width-1)/2):int(centre_idx+(width+1)/2)], height_thresh)
    
def get_pse(x):
    x_fft = np.fft.rfft(x)
    x_P = (1/len(x_fft))*np.absolute(x_fft)**2
    x_p = x_P/sum(x_P)
    pse = np.sum([(-p*np.log2(p)) for p in x_p])
    return pse

def get_local_pse(x, centre_idx, width=50):
    if ((width%2) == 0):
        width += 1
        
    if (centre_idx < (width-1)/2):
        return get_pse(x[:width+1])
    elif (centre_idx > (len(x)-1-(width-1)/2)):
        return get_pse(x[len(x)-1-width:])
    else:
        return get_pse(x[int(centre_idx-(width-1)/2):int(centre_idx+(width+1)/2)])
    
def get_spectral_centroid(x):
    x_fft = np.fft.rfft(x)
    x_spectrum = np.absolute(x_fft)
    normalized_spectrum = x_spectrum/sum(x_spectrum)
    normalized_frequencies = np.arange(0, len(x_spectrum), 1)
    return sum(normalized_frequencies * normalized_spectrum)

def get_local_spectral_centroid(x, centre_idx, width=50):
    if ((width%2) == 0):
        width += 1
        
    if (centre_idx < (width-1)/2):
        return get_spectral_centroid(x[:width+1])
    elif (centre_idx > (len(x)-1-(width-1)/2)):
        return get_spectral_centroid(x[len(x)-1-width:])
    else:
        return get_spectral_centroid(x[int(centre_idx-(width-1)/2):int(centre_idx+(width+1)/2)])
    
def get_local_energy(x, centre_idx, width=60):
    if ((width%2) == 0):
        width += 1
        
    if (centre_idx < (width-1)/2):
        return np.sum(x[:width+1]**2)
    elif (centre_idx > (len(x)-1-(width-1)/2)):
        return np.sum(x[len(x)-1-width:]**2)
    else:
        return np.sum(x[int(centre_idx-(width-1)/2):int(centre_idx+(width+1)/2)]**2)
    
def get_width_max_energy(x, M=14, width_thresh=0.2):
    v = np.ones(M)
    x_ = np.convolve(abs(x), v)
    if any(x_[np.argmax(x_):] < width_thresh*np.max(x_)):
        end_idx = np.argmax(x_) + np.argmax(x_[np.argmax(x_):] < width_thresh*np.max(x_))
    else:
        end_idx = len(x_)-1
    if any(x_[np.argmax(x_)::-1] < width_thresh*np.max(x_)):  
        start_idx = np.argmax(x_) - np.argmax(x_[np.argmax(x_)::-1] < width_thresh*np.max(x_))
    else:
        start_idx = 0

    return (end_idx - start_idx)

In [8]:
def get_hand_picked_feature_dict(x, col_prefix=''):
    feature_dict = {}
    
    feature_dict[col_prefix + 'Mean'] = np.mean(abs(x))
    feature_dict[col_prefix + 'Normalised Mean'] = np.mean(abs(x)/max(abs(x)))
    
    return feature_dict

def get_hand_engineered_feature_dict(x, col_prefix = ''):
    feature_dict = {}
    # Hand engineered features
    
    min_idx = np.argmin(x)
    max_idx = np.argmax(x)
    
    height_thresh=0.1
    peaks = get_peaks(x, height_thresh)
    feature_dict[col_prefix + 'Number of Peaks: thresh=0.1'] = len(peaks[0])
    feature_dict[col_prefix + 'Percentage Fractionation: thresh=0.1'] = percentage_fractionation(x, peaks[0], thresh=0.01)
    feature_dict[col_prefix + 'Peaks Between Min and Max: thresh=0.1'] = len([i for i in peaks[0] if ((i > min_idx) & (i < max_idx))])
    
    height_thresh=0.2
    peaks = get_peaks(x, height_thresh)
    feature_dict[col_prefix + 'Number of Peaks: thresh=0.2'] = len(peaks[0])
    feature_dict[col_prefix + 'Percentage Fractionation: thresh=0.2'] = percentage_fractionation(x, peaks[0], thresh=0.01)
    feature_dict[col_prefix + 'Peaks Between Min and Max: thresh=0.2'] = len([i for i in peaks[0] if ((i > min_idx) & (i < max_idx))])

    
    max_energy_idx = get_location_of_max_energy(x, M=14)
    feature_dict[col_prefix + 'Location of Maximum Energy: M=14'] = max_energy_idx
    max_energy_idx = get_location_of_max_energy(x, M=10)
    feature_dict[col_prefix + 'Location of Maximum Energy: M=10'] = max_energy_idx
    
#     feature_dict[col_prefix + 'Sample Entropy Around Max Energy: width=30 r=0.15 m=3'] = get_local_sample_entropy(x, max_energy_idx, 60, m=3, r=0.15)
    
    feature_dict[col_prefix + 'Width of Maximum Energy: M=14, width_thresh=0.2'] = get_width_max_energy(x, M=14, width_thresh=0.2)

    return feature_dict

def get_spectral_feature_dict(x, col_prefix = ''):
    feature_dict = {}
    feature_dict[col_prefix + 'Power Spectral Entropy'] = get_pse(x)
    feature_dict[col_prefix + 'Spectral Centroid'] = get_spectral_centroid(x)
    max_energy_idx = get_location_of_max_energy(x)
    feature_dict[col_prefix + 'Power Spectral Entropy Around Maximum Energy: width=60'] = get_local_pse(x, max_energy_idx, width=60)
    feature_dict[col_prefix + 'Spectral Centroid Around Maximum Energy: width=60'] = get_local_spectral_centroid(x, max_energy_idx, width=60)
    
    return feature_dict
    
def get_temporal_feature_dict(x, col_prefix = ''):

    feature_dict = {}
    feature_dict[col_prefix + 'Maximum Absolute Value'] = np.max(abs(x))
    
#     feature_dict[col_prefix + 'Approximate Entropy: m=3 r=0.7'] = feature_calculators.approximate_entropy(x, 3, 0.7)
    feature_dict[col_prefix + 'Ratio Beyond 1xSTD'] = feature_calculators.ratio_beyond_r_sigma(x, 1)
    # A fraction q of the mass lies to the left of i. (Alternative to conduction delay?)
    

    return feature_dict