# Feature engineering

    a. scaling
    b. manual feature generation
    c. organize data layout

In [1]:
# Module imports
import warnings

warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
from sklearn.preprocessing import StandardScaler
from scipy.stats import skew, kurtosis

In [2]:
# Load data
root = Path('/tmp/working/fang/data')  # use the current directory as the root
data_file = root / 'lpp_step_1_load.pkl'
data_df = pickle.load(data_file.open('rb'))
data_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0,1,2,3,4,5,6,7,8,9,...,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399
healthy,positive/neutral,id,channel,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
True,0,1055,FP1,1.4035,1.229,1.0458,0.8607,0.6807,0.5115,0.3568,0.2182,0.0954,-0.0129,...,-0.3979,-0.3179,-0.2428,-0.1725,-0.1067,-0.0451,0.0124,0.0535,0.092,0.1309
True,0,1055,Fz,0.4953,0.3816,0.2665,0.1513,0.0373,-0.0747,-0.1844,-0.2913,-0.3942,-0.4908,...,-3.349,-3.2514,-3.1495,-3.0465,-2.946,-2.8515,-2.7657,-2.6993,-2.6447,-2.6018
True,0,1055,F3,-0.0294,-0.1011,-0.1702,-0.2354,-0.2952,-0.3479,-0.3926,-0.4288,-0.4562,-0.4742,...,-5.5209,-5.5369,-5.5607,-5.5932,-5.6342,-5.6824,-5.7358,-5.7937,-5.8516,-5.9063
True,0,1055,F7,0.2995,0.2385,0.187,0.1469,0.12,0.1063,0.1039,0.1098,0.1214,0.1383,...,0.0659,0.0373,0.0035,-0.0364,-0.0821,-0.1329,-0.1872,-0.2437,-0.2992,-0.3493
True,0,1055,FT9,-0.0859,-0.1063,-0.1262,-0.1449,-0.1617,-0.1753,-0.1848,-0.1891,-0.1876,-0.18,...,-0.4807,-0.4755,-0.4666,-0.454,-0.4381,-0.42,-0.4008,-0.3817,-0.3645,-0.3502


### a. Scaling

In [3]:
# scaling

values = StandardScaler().fit_transform(data_df.T).T
scaled_df = pd.DataFrame(values, index=data_df.index, columns=data_df.columns)
scaled_df.head().T.describe()  # zero mean and unit standard deviation expected

healthy,True,True,True,True,True
positive/neutral,0,0,0,0,0
id,1055,1055,1055,1055,1055
channel,FP1,Fz,F3,F7,FT9
count,1400.0,1400.0,1400.0,1400.0,1400.0
mean,-4.060244e-17,6.090366000000001e-17,-1.218073e-16,1.624098e-16,2.030122e-16
std,1.000357,1.000357,1.000357,1.000357,1.000357
min,-1.858195,-1.607354,-1.508808,-1.686597,-2.148432
25%,-0.8933267,-0.876411,-0.8465499,-0.9326857,-0.741138
50%,0.1478106,-0.09726929,-0.2162883,-0.1015857,-0.1118703
75%,0.8228506,0.728175,0.4960456,0.9468201,0.8088878
max,2.010094,2.135078,2.164978,1.80825,2.537074


In [4]:
# save data
path1 = root / 'lpp_step_2_scaled.pkl'
pickle.dump(scaled_df, path1.open('wb'))

### b. feature generation


In [5]:
def features_from_functions(data_df, functions):
    """
    Function to produce a dataframe of the results of a list of functions
    
    :param data_df: a dataframe as input, each row represents a sample and each column represents a feature
    :param functions: a list of two-tuple of the name and the reference to a function
    :return: a dataframe of the horizontally stacked results of the application of function on rows of the input dataframe
    """
    names = [name for name, _ in functions]
    features = [data_df.apply(func, axis=1) for _, func in functions]
    feature_df = pd.concat(features, axis=1, keys=names)
    feature_df.index = data_df.index
    return feature_df

def select_channels(data_df, channels):
    return data_df.loc[(slice(None), slice(None), channels), :].unstack('channel')

def signaltonoise(row):
    """
    Calculate the signal to noise metric for a row
    """
    return np.mean(row) / np.std(row)

In [6]:
# List the feature generating functions here.
# Give each function a name as the first string in the tuple.
functions = [
    ('max', np.amax),
    ('min', np.amin),
    ('range', np.ptp),
    ('std', np.std),
    ('avg', np.mean),
    ('skew', skew),
    ('kurtosis', kurtosis),
    ('signaltonoise', signaltonoise)
]


In [7]:
#  Generate the feature dataframe containing all features for all channels
features_df = features_from_functions(data_df, functions)
features_df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,max,min,range,std,avg,skew,kurtosis,signaltonoise
healthy,positive/neutral,id,channel,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
True,0,1055,FP1,2.0284,-8.1509,10.1793,2.631473,-3.26111,-0.132195,-1.09073,-1.239271
True,0,1055,Fz,1.8268,-11.5478,13.3746,3.573772,-5.803482,0.240272,-0.917673,-1.623909
True,0,1055,F3,1.2862,-12.2165,13.5027,3.675418,-6.671,0.623551,-0.636454,-1.815032
True,0,1055,F7,1.1601,-6.4305,7.5906,2.171941,-2.767312,0.18626,-1.292261,-1.27412
True,0,1055,FT9,0.8369,-3.1869,4.0238,0.858776,-1.341878,0.270833,-0.698081,-1.562548


In [8]:
#  Calculate the difference between the positive and neutral signals of the same channel
diff_df = data_df.xs(1, level=1) - data_df.xs(0, level=1)

#  Standization
values = StandardScaler().fit_transform(diff_df.T).T
diff_scaled_df = pd.DataFrame(values, index=diff_df.index, columns=diff_df.columns)

#  Generate the features on difference data, based on data before standazation
diff_features_df = features_from_functions(diff_df, functions)
diff_features_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,max,min,range,std,avg,skew,kurtosis,signaltonoise
healthy,id,channel,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
False,2001,C3,3.7442,-5.4536,9.1978,1.739813,-0.657218,-0.343139,0.159621,-0.377752
False,2001,C4,5.2597,-4.9209,10.1806,1.943314,1.015727,-0.536508,0.66305,0.522678
False,2001,CP1,2.7229,-5.2202,7.9431,1.60667,-0.844072,-0.234715,-0.578945,-0.525355
False,2001,CP2,5.1912,-5.6636,10.8548,1.948676,0.10246,-0.35025,0.395914,0.05258
False,2001,CP5,3.1503,-8.7379,11.8882,2.447677,-2.736762,-0.147619,-0.483465,-1.118106


### c. organize data

In [9]:
# union gain and loss signals
# Union gain, loss and diff signals
unioned_df = pd.concat(
    [scaled_df.xs(1, level=1), 
     scaled_df.xs(0, level=1), 
     diff_scaled_df], 
    keys=['positive', 'neutral', 'lpp diff'], 
    axis=1)
unioned_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,positive,positive,positive,positive,positive,positive,positive,positive,positive,positive,...,lpp diff,lpp diff,lpp diff,lpp diff,lpp diff,lpp diff,lpp diff,lpp diff,lpp diff,lpp diff
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,0,1,2,3,4,5,6,7,8,9,...,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399
healthy,id,channel,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
False,2001,C3,0.822947,0.810574,0.790177,0.771591,0.757044,0.747518,0.742807,0.73499,0.728363,0.721064,...,0.035876,-0.094713,-0.249557,-0.42699,-0.623965,-0.834562,-1.050908,-1.264149,-1.463998,-1.643213
False,2001,C4,-0.287405,-0.281127,-0.273582,-0.260444,-0.241977,-0.219131,-0.193331,-0.169641,-0.14574,-0.122683,...,0.107586,0.058134,0.00081,-0.059963,-0.120581,-0.177237,-0.227151,-0.268781,-0.300583,-0.324305
False,2001,CP1,1.068188,1.047109,1.015555,0.986241,0.960089,0.937429,0.918062,0.890461,0.863848,0.837301,...,0.645292,0.553239,0.436911,0.297306,0.135667,-0.044021,-0.235909,-0.432714,-0.623854,-0.80292
False,2001,CP2,-0.114372,-0.139265,-0.167956,-0.189835,-0.203819,-0.209485,-0.207315,-0.206772,-0.201287,-0.192668,...,1.019687,0.972476,0.909202,0.832945,0.74663,0.653952,0.559015,0.465516,0.378739,0.299916
False,2001,CP5,0.376856,0.390188,0.404805,0.424179,0.448691,0.477994,0.510699,0.542433,0.57316,0.600796,...,-1.054158,-1.113847,-1.181462,-1.257044,-1.340184,-1.429208,-1.521581,-1.614036,-1.702283,-1.782686


In [10]:
unioned_feature_df = pd.concat(
    [features_df.xs(1, level=1), 
     features_df.xs(0, level=1),
     diff_features_df], 
    keys=['positive', 'neutral', 'lpp diff'], 
    axis=1)
unioned_feature_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,positive,positive,positive,positive,positive,positive,positive,positive,neutral,neutral,neutral,neutral,neutral,lpp diff,lpp diff,lpp diff,lpp diff,lpp diff,lpp diff,lpp diff,lpp diff
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,max,min,range,std,avg,skew,kurtosis,signaltonoise,max,min,...,kurtosis,signaltonoise,max,min,range,std,avg,skew,kurtosis,signaltonoise
healthy,id,channel,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
False,2001,C3,2.1334,-5.7414,7.8748,1.931611,-0.797114,-1.083241,0.555988,-0.412668,4.803,-6.7964,...,0.17256,-0.060329,3.7442,-5.4536,9.1978,1.739813,-0.657218,-0.343139,0.159621,-0.377752
False,2001,C4,4.7859,-6.6008,11.3867,1.895307,0.721121,-0.943935,1.845432,0.380477,3.4056,-6.3681,...,0.457037,-0.153781,5.2597,-4.9209,10.1806,1.943314,1.015727,-0.536508,0.66305,0.522678
False,2001,CP1,2.6912,-4.4404,7.1316,1.518056,-0.60987,-0.067931,-0.475303,-0.401744,3.4386,-4.5419,...,0.307061,0.163722,2.7229,-5.2202,7.9431,1.60667,-0.844072,-0.234715,-0.578945,-0.525355
False,2001,CP2,3.8585,-4.584,8.4425,1.659078,0.627152,-0.259761,0.110769,0.378012,4.1357,-4.0241,...,0.170597,0.352152,5.1912,-5.6636,10.8548,1.948676,0.10246,-0.35025,0.395914,0.05258
False,2001,CP5,4.7617,-7.0327,11.7944,2.880233,-1.316733,0.074903,-0.717626,-0.457162,5.461,-2.6329,...,-0.606363,0.825785,3.1503,-8.7379,11.8882,2.447677,-2.736762,-0.147619,-0.483465,-1.118106


In [11]:
unioned_df_file = root / 'lpp_step_2_unioned.pkl'
unioned_feature_df_file = root / 'lpp_step_2_unioned_feature.pkl'
pickle.dump(unioned_df, unioned_df_file.open('wb'))
pickle.dump(unioned_feature_df, unioned_feature_df_file.open('wb'))

In [12]:
FS = 1000  # sampling rate, 1000 Hz
POINTS = 1400  # total data points per sample, 1.5 seconds, 1000 Hz, 1500 data points

def fft_func(row):
    """
    Functions to be used in feature generation
    """
    freq = np.fft.rfftfreq(POINTS, d = 1.0/FS)
    fft = np.absolute(np.fft.rfft(row).real).flatten()
    # Define EEG bands
    eeg_bands = {'Delta': (0, 4),
                 'Theta': (4, 8),
                 'Alpha': (8, 12),
                 'Beta': (12, 30),
                 'Gamma': (30, 45)}

    # Take the mean of the fft amplitude for each EEG band
    eeg_band_fft = dict()
    bands = eeg_bands.keys()
    for band in bands:  
        freq_ix = np.where((freq >= eeg_bands[band][0]) & 
                           (freq <= eeg_bands[band][1]))[0]
        eeg_band_fft[band] = np.mean(fft[freq_ix])
    values = [eeg_band_fft[band] for band in bands]
    result = pd.Series(values, index=bands)
    return result

In [13]:
# Prepare bands feature
functions = [
    ('bands', fft_func)
]

bands_features_df = features_from_functions(data_df, functions)
# union gain and loss signals
unioned_bands_feature_df = pd.concat(
    [bands_features_df.xs(1, level=1), 
     bands_features_df.xs(0, level=1)], 
    keys=['positive', 'neutral'], 
    axis=1)

unioned_bands_feature_df.columns = unioned_bands_feature_df.columns.droplevel(level=1)
unioned_bands_feature_df_file = Path('/tmp/working/fang/data/lpp_step_2_bands_feature_df.pkl')
pickle.dump(unioned_bands_feature_df, unioned_bands_feature_df_file.open('wb'), protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
unioned_bands_feature_df.describe()

Unnamed: 0_level_0,positive,positive,positive,positive,positive,neutral,neutral,neutral,neutral,neutral
Unnamed: 0_level_1,Delta,Theta,Alpha,Beta,Gamma,Delta,Theta,Alpha,Beta,Gamma
count,3597.0,3597.0,3597.0,3597.0,3597.0,3597.0,3597.0,3597.0,3597.0,3597.0
mean,1028.94157,180.258808,135.861523,63.936788,23.361387,970.857919,178.106617,136.732829,65.160201,22.990843
std,689.985852,102.968218,87.498774,39.678239,22.644019,664.35207,101.81729,97.82617,34.399149,19.66544
min,57.535122,4.825431,7.923289,11.18972,3.894709,42.551132,16.834114,15.318947,12.608163,3.789262
25%,533.705792,107.54675,79.392337,43.387178,13.287994,471.651245,105.874561,77.153014,45.226372,13.402102
50%,870.209724,159.538282,116.328096,56.548319,18.196464,815.348777,157.464785,112.082829,57.530189,18.485059
75%,1337.167675,225.929544,169.13733,74.341399,27.050989,1289.194497,221.985533,166.412,76.044918,26.06004
max,6535.909427,1911.127626,1294.529808,1264.127813,873.15511,5685.997088,994.174222,1164.371105,681.847813,603.787593
