In [1]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from utils.feature_selection import *

# Feature Selection

In [2]:
# Directory holding the final, normalised patch tables (trailing slash is
# required because file names are appended directly).
final_dataset_path = '../dataset/final/whole/'
# Normalised accelerometer and gyroscope feature tables.
acc_dataset_path = f'{final_dataset_path}PatchTable_Acceleration_Normalised.csv'
gyro_dataset_path = f'{final_dataset_path}PatchTable_Gyroscope_Normalised.csv'

In [3]:
# Load each table, discard the first CSV column by position, and remove the
# 'ClassificationType' column so that 'Class' is the only label column left.
# NOTE(review): the first column is presumably a saved row index - confirm.
acc_dataset_df = (
    pd.read_csv(acc_dataset_path)
    .iloc[:, 1:]
    .drop(columns=['ClassificationType'])
)
gyro_dataset_df = (
    pd.read_csv(gyro_dataset_path)
    .iloc[:, 1:]
    .drop(columns=['ClassificationType'])
)

### Feature Importance

In [4]:
# Split each sensor table into the label column ('Class') and the predictors
# (every remaining column).
acc_target = acc_dataset_df['Class']
acc_features = acc_dataset_df.drop('Class', axis=1)
gyro_target = gyro_dataset_df['Class']
gyro_features = gyro_dataset_df.drop('Class', axis=1)

In [5]:
# Fit one random forest per sensor; its feature importances drive the first
# selection step below.
# A fixed seed is needed because forest training is randomised: without it the
# set of features with non-zero importance (and therefore every downstream
# result and the saved CSVs) can change on each Restart & Run All.
RANDOM_STATE = 42
acc_random_forest_model = RandomForestClassifier(random_state=RANDOM_STATE)
gyro_random_forest_model = RandomForestClassifier(random_state=RANDOM_STATE)
acc_random_forest_model.fit(acc_features, acc_target)
gyro_random_forest_model.fit(gyro_features, gyro_target)

In [6]:
# Get feature importance
acc_feature_importance = acc_random_forest_model.feature_importances_
gyro_feature_importance = gyro_random_forest_model.feature_importances_

# Keep every feature the forest assigned a non-zero importance.
# The importances are ordered like the columns the model was fitted on, so the
# names are taken from the feature frames themselves. Looking them up as
# `dataset_df.columns[i + 1]` is only correct while 'Class' is the first
# column of the dataset and would silently mislabel features otherwise.
acc_best_features = acc_features.columns[acc_feature_importance > 0].tolist()
gyro_best_features = gyro_features.columns[gyro_feature_importance > 0].tolist()


In [7]:
# Rebuild each sensor table as: label column first, then the retained features.
# Each frame takes 'Class' from its OWN source table (the gyroscope frame
# previously borrowed the accelerometer labels), and the label is selected by
# name rather than by assuming it is column 0.
filtered_acc_dataset_df = acc_dataset_df[['Class', *acc_best_features]].copy()
filtered_gyro_dataset_df = gyro_dataset_df[['Class', *gyro_best_features]].copy()

In [8]:
# Inspect the accelerometer table after the importance-based filtering.
filtered_acc_dataset_df

Unnamed: 0,Class,DominantFreq_VT,DominantFreq_AP,CorssCorrelationSum_VTML,CorssCorrelationSum_VTAP,CorssCorrelationSum_MLAP,CorssCorrelationLag_VTML,CorssCorrelationPeak_VTAP,CorssCorrelationLag_VTAP,CorssCorrelationPeak_MLAP,...,RelativePower_AboveDomFr_Magnitude,CentroidSpMean_Magnitude,CentroidSpMax_Magnitude,MeanPSD_Plomb_Magnitude,MedianPSD_Plom_Magnitude,MeanFreq_Plom_Magnitude,SumPSD_Plom_Magnitude,Moment2ndPSD_Plom_Magnitude,ZeroCrossingRate_Magnitude,EnergySignal_Magnitude
0,0,-0.565279,-0.349227,-0.339676,-0.226463,-0.190883,-0.226534,-0.699258,-1.051128,0.163948,...,-0.301511,-0.092303,-0.092303,-1.140718,-0.381468,-0.492091,-1.027345,-0.905046,-0.737538,-1.027326
1,1,-0.642362,-0.727585,1.518135,3.290363,2.295714,3.136402,2.605245,1.841974,0.318755,...,-0.301511,2.227281,2.227281,-0.474955,-0.404628,-1.133617,1.839771,-0.590126,0.634839,1.839808
2,0,-0.398265,2.833913,-0.617424,-0.405724,-0.812518,0.023633,-0.567433,-0.171941,-0.62274,...,-0.301511,-0.108542,-0.108542,-0.690686,-0.406813,-0.286002,-0.668854,-0.704978,-0.630665,-0.668855
3,1,2.742887,-0.081909,1.032493,0.018871,-0.037891,-0.629212,1.641298,1.36585,-0.69708,...,-0.301511,1.621542,1.621542,-0.649378,-0.430559,-0.912159,0.351717,-0.721528,-0.473802,0.351628
4,0,-0.481772,-0.25875,-0.7384,-0.413833,-0.826791,-0.177941,-0.537018,-0.426171,-0.51752,...,-0.301511,-0.473631,-0.473631,0.574786,-0.426267,-0.072173,0.307707,0.30175,0.190848,0.307731
5,1,-0.969967,-0.505505,-0.580067,-0.290057,1.389306,0.075973,-0.198954,0.620638,2.784512,...,3.316625,-0.277505,-0.277505,1.005212,-0.395151,-0.35472,1.231292,0.953839,0.005452,1.231322
6,0,0.738717,1.369833,-0.736495,-0.428313,-0.848734,-0.807192,-1.005746,-1.302736,-0.842085,...,-0.301511,-1.897663,-1.897663,1.830482,3.038069,2.663853,-0.58073,2.223116,0.409136,-0.580734
7,1,-0.687328,-0.472604,-0.693986,-0.325986,-0.782489,-0.444676,-0.347422,-0.897052,-0.714855,...,-0.301511,0.088414,0.088414,-0.602498,1.047793,1.433947,-1.212053,-0.679406,1.500628,-1.212035
8,0,0.321181,-0.645333,-0.655124,-0.408778,-0.765699,-0.193455,-0.261149,-0.447758,-0.563378,...,-0.301511,-0.526397,-0.526397,-0.312869,-0.427505,-0.049008,-0.496495,-0.416946,-0.681959,-0.496477
9,1,0.006424,-0.49728,-0.378319,-0.353833,-0.562437,0.054253,-0.118487,-0.067417,-0.6363,...,-0.301511,0.002039,0.002039,-1.34589,-0.421828,-0.37263,-1.318803,-0.955736,1.265671,-1.318813


In [9]:
# Inspect the gyroscope table after the importance-based filtering.
filtered_gyro_dataset_df

Unnamed: 0,Class,DominantFreq_VT,DominantFreq_ML,CorssCorrelationPeak_VTAP,CorssCorrelationPeak_MLAP,CorssCorrelationLag_MLAP,Amplitude_ML,AmplitudeNorm_VT,AmplitudeNorm_AP,Width_ML,...,ZeroCrossingRate_AP,EnergySignal_AP,Std_SigComplete_Magnitude,InterQuartileRange_SigComplete_Magnitude,Percentile75_SigComplete_Magnitude,SumSq75_SigComplete_Magnitude,RelativePower_Below5Hz_Magnitude,MeanPSD_Plomb_Magnitude,ZeroCrossingRate_Magnitude,ModeSignal_Magnitude
0,0,0.727909,-0.996089,0.448143,-0.701455,-0.220192,-0.979382,-1.19134,-1.360963,-0.736323,...,-0.662867,-0.307579,0.073883,0.060873,-0.042897,0.221407,0.964929,0.036056,-0.201971,1.140569
1,1,0.605287,0.932871,1.800904,-0.225887,2.695983,-0.969683,-2.039258,-1.776933,-0.954785,...,-0.433195,1.675297,-0.961825,-0.668794,-0.68316,2.507606,-0.132561,-0.970511,0.284359,-0.176969
2,0,-1.172733,1.459405,-0.707054,1.314729,-0.15415,-0.155148,1.23766,0.059221,0.397288,...,-0.696122,0.249795,-0.878979,-0.89461,-0.503766,-0.218254,-0.592845,-0.891415,1.849907,-0.386439
3,1,1.178162,-1.038511,0.182726,2.567063,1.2722,0.344264,-0.777456,0.947033,0.667341,...,-0.531354,1.541128,-1.665824,-0.961254,-0.565357,1.031284,-1.111037,-1.598924,1.827208,0.063733
4,0,1.524952,0.196722,-0.355487,-0.660971,-0.378274,2.236713,0.72303,-0.014811,-0.499875,...,-0.32149,0.132507,0.749743,0.423798,0.527352,-0.376234,0.941986,0.731691,-0.173902,0.772013
5,1,0.800716,-0.207536,-0.408242,-0.058896,-0.06877,1.45541,0.437516,1.331756,1.101773,...,1.007367,0.485058,1.358672,1.399631,1.657079,0.402317,-0.79366,1.395239,-0.522966,-0.065989
6,0,-1.00796,-0.786473,-0.722907,-0.956839,-1.23601,-0.6491,0.609789,0.067503,2.190991,...,0.93206,-1.662963,0.468387,0.926366,0.782152,-1.242935,1.998529,0.441121,-0.08042,-2.156054
7,1,-1.046279,1.110046,-0.991809,-0.934588,-1.026892,-0.720684,0.721073,1.693719,-1.118354,...,1.343756,-1.468062,0.027765,-0.411536,-0.604682,-1.24502,-0.502118,-0.014575,-0.564841,-0.943992
8,0,-0.87001,1.439442,-0.811131,-0.610368,-0.392718,-0.436874,0.567927,-0.731522,0.828515,...,-1.771286,-0.315299,1.048956,0.878272,0.850269,-0.339278,0.978372,1.056377,-1.648707,0.527311
9,1,-0.682245,-1.175759,-0.97606,-0.049916,-0.043894,-0.026449,-1.201571,0.481633,-0.88303,...,-1.035943,-1.056554,-1.305696,-1.911917,-2.272928,-0.855361,-1.648869,-1.284298,-1.225724,1.828185


### Correlation Selection

In [10]:
# Pairwise Pearson correlation between every column (label included) of each
# filtered table; 'pearson' is the pandas default, spelled out here.
acc_corr = filtered_acc_dataset_df.corr(method='pearson')
gyro_corr = filtered_gyro_dataset_df.corr(method='pearson')

In [11]:
# Inspect the accelerometer correlation matrix (label row/column included).
acc_corr

Unnamed: 0,Class,DominantFreq_VT,DominantFreq_AP,CorssCorrelationSum_VTML,CorssCorrelationSum_VTAP,CorssCorrelationSum_MLAP,CorssCorrelationLag_VTML,CorssCorrelationPeak_VTAP,CorssCorrelationLag_VTAP,CorssCorrelationPeak_MLAP,...,RelativePower_AboveDomFr_Magnitude,CentroidSpMean_Magnitude,CentroidSpMax_Magnitude,MeanPSD_Plomb_Magnitude,MedianPSD_Plom_Magnitude,MeanFreq_Plom_Magnitude,SumPSD_Plom_Magnitude,Moment2ndPSD_Plom_Magnitude,ZeroCrossingRate_Magnitude,EnergySignal_Magnitude
Class,1.000000,0.203415,-0.443131,0.119361,0.337073,0.393101,0.377699,0.576362,0.347575,0.240162,...,0.301511,0.587947,0.587947,-0.077441,-0.168576,-0.233972,0.349275,-0.078178,0.623688,0.349266
DominantFreq_VT,0.203415,1.000000,0.084006,0.067744,-0.137115,-0.274873,-0.263909,0.295082,0.053454,-0.434741,...,-0.292456,0.186090,0.186090,0.076212,0.119722,0.017723,0.011872,0.107477,0.038432,0.011843
DominantFreq_AP,-0.443131,0.084006,1.000000,-0.252446,-0.253861,-0.395697,-0.220323,-0.331762,-0.242169,-0.318744,...,-0.152416,-0.324431,-0.324431,0.098966,0.336378,0.314709,-0.284217,0.149649,-0.164761,-0.284222
CorssCorrelationSum_VTML,0.119361,0.067744,-0.252446,1.000000,0.540556,0.688697,0.263320,0.587574,0.815266,0.228862,...,-0.174897,0.529325,0.529325,-0.134166,-0.291397,-0.473959,0.477327,-0.212600,-0.498823,0.477319
CorssCorrelationSum_VTAP,0.337073,-0.137115,-0.253861,0.540556,1.000000,0.735542,0.920991,0.838128,0.625966,0.121121,...,-0.087455,0.733443,0.733443,-0.173813,-0.161104,-0.397730,0.580007,-0.213911,0.137633,0.580016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MeanFreq_Plom_Magnitude,-0.233972,0.017723,0.314709,-0.473959,-0.397730,-0.517166,-0.447928,-0.567279,-0.674190,-0.328861,...,-0.106952,-0.711225,-0.711225,0.486960,0.938755,1.000000,-0.429093,0.570855,0.318175,-0.429090
SumPSD_Plom_Magnitude,0.349275,0.011872,-0.284217,0.477327,0.580007,0.786588,0.549827,0.588084,0.616781,0.595476,...,0.371248,0.440110,0.440110,0.462380,-0.312301,-0.429093,1.000000,0.346060,-0.084557,1.000000
Moment2ndPSD_Plom_Magnitude,-0.078178,0.107477,0.149649,-0.212600,-0.213911,0.004172,-0.209655,-0.332946,-0.316368,0.260248,...,0.287593,-0.582096,-0.582096,0.977716,0.557515,0.570855,0.346060,1.000000,0.117188,0.346059
ZeroCrossingRate_Magnitude,0.623688,0.038432,-0.164761,-0.498823,0.137633,-0.181354,0.308172,0.127491,-0.331129,-0.246788,...,0.001644,0.106932,0.106932,0.048620,0.293868,0.318175,-0.084557,0.117188,1.000000,-0.084554


In [12]:
# Inspect the gyroscope correlation matrix (label row/column included).
gyro_corr

Unnamed: 0,Class,DominantFreq_VT,DominantFreq_ML,CorssCorrelationPeak_VTAP,CorssCorrelationPeak_MLAP,CorssCorrelationLag_MLAP,Amplitude_ML,AmplitudeNorm_VT,AmplitudeNorm_AP,Width_ML,...,ZeroCrossingRate_AP,EnergySignal_AP,Std_SigComplete_Magnitude,InterQuartileRange_SigComplete_Magnitude,Percentile75_SigComplete_Magnitude,SumSq75_SigComplete_Magnitude,RelativePower_Below5Hz_Magnitude,MeanPSD_Plomb_Magnitude,ZeroCrossingRate_Magnitude,ModeSignal_Magnitude
Class,1.000000,-0.030496,-0.264930,0.007659,0.160739,0.406600,0.159197,-0.483660,0.465709,-0.199984,...,0.180223,0.246117,-0.478409,-0.476241,-0.438862,0.251790,-0.723573,-0.472381,0.052033,-0.021881
DominantFreq_VT,-0.030496,1.000000,-0.187647,0.528043,0.272645,0.463173,0.317635,-0.256582,-0.202340,-0.231573,...,0.026352,0.601496,0.163796,0.259186,0.286434,0.592619,-0.016445,0.180730,0.139760,0.410720
DominantFreq_ML,-0.264930,-0.187647,1.000000,-0.039576,-0.074862,0.075118,-0.215588,0.401313,-0.136449,-0.132242,...,-0.128346,0.065033,0.235888,0.158327,0.175666,0.100594,0.080923,0.231785,-0.065928,-0.119277
CorssCorrelationPeak_VTAP,0.007659,0.528043,-0.039576,1.000000,0.205104,0.603406,-0.291465,-0.342827,-0.623505,-0.402738,...,0.233900,0.642764,0.056145,0.199354,0.147938,0.746735,0.025583,0.065104,0.248352,0.053139
CorssCorrelationPeak_MLAP,0.160739,0.272645,-0.074862,0.205104,1.000000,0.444897,0.055136,-0.040398,0.166916,0.090554,...,-0.157170,0.615989,-0.488935,-0.315640,-0.187161,0.405460,-0.560691,-0.463927,0.734140,0.112674
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SumSq75_SigComplete_Magnitude,0.251790,0.592619,0.100594,0.746735,0.405460,0.938340,-0.113367,-0.577289,-0.500737,-0.217734,...,-0.170754,0.900768,-0.231661,-0.050598,0.018050,1.000000,-0.209695,-0.217962,0.351658,0.177858
RelativePower_Below5Hz_Magnitude,-0.723573,-0.016445,0.080923,0.025583,-0.560691,-0.366954,-0.138083,0.252734,-0.460023,0.354431,...,0.036861,-0.318038,0.535049,0.602020,0.556716,-0.209695,1.000000,0.518283,-0.221574,-0.344304
MeanPSD_Plomb_Magnitude,-0.472381,0.180730,0.231785,0.065104,-0.463927,-0.477266,0.143643,0.585576,-0.055095,0.184572,...,0.393712,-0.226943,0.999376,0.941038,0.869382,-0.217962,0.518283,1.000000,-0.514549,-0.096360
ZeroCrossingRate_Magnitude,0.052033,0.139760,-0.065928,0.248352,0.734140,0.344689,0.069163,0.014920,0.056328,0.130294,...,0.077570,0.537290,-0.519026,-0.300473,-0.132391,0.351658,-0.221574,-0.514549,1.000000,-0.318817


In [21]:
# Split each correlation matrix in two:
#   *_target_corr         - the first row without the 'Class' column, i.e. each
#                           feature's correlation with the first column of the
#                           filtered table (relies on 'Class' being that column);
#   *_feature_corr_matrix - the feature-vs-feature matrix, with the 'Class'
#                           column and that first row removed.
acc_target_corr = acc_corr.drop(columns=['Class']).iloc[[0]].copy()
acc_feature_corr_matrix = acc_corr.drop(columns=['Class'], index=acc_corr.index[0])
gyro_target_corr = gyro_corr.drop(columns=['Class']).iloc[[0]].copy()
gyro_feature_corr_matrix = gyro_corr.drop(columns=['Class'], index=gyro_corr.index[0])

In [22]:
# Ask the project helper which features to discard, given the feature-vs-feature
# correlations and each feature's correlation with the target.
# NOTE(review): get_corr_redundant_features comes from the star import of
# utils.feature_selection; its threshold and tie-breaking rule are not visible
# here - confirm against that module.
acc_corr_redundant_features = get_corr_redundant_features(
    acc_feature_corr_matrix,
    acc_target_corr,
)
gyro_corr_redundant_features = get_corr_redundant_features(
    gyro_feature_corr_matrix,
    gyro_target_corr,
)

In [25]:
# Print how many accelerometer features were flagged, then show their names.
n_acc_redundant = len(acc_corr_redundant_features)
print(n_acc_redundant)
acc_corr_redundant_features

92


['CorssCorrelationSum_VTML',
 'CorssCorrelationSum_VTAP',
 'CorssCorrelationLag_VTML',
 'CorssCorrelationSum_MLAP',
 'CorssCorrelationLag_VTAP',
 'DominantFreq_CorrVTAP',
 'AmplitudeNorm_ML',
 'SkewnessPSD_VT',
 'DominantFreq_AP',
 'AmplitudeNorm_AP',
 'MeanPower_VT',
 'MeanPower_AP',
 'MedianPower_ML',
 'IntegratedPower_VT',
 'HarmonicRatio_VT',
 'DominantFreq_VT',
 'RMS_SigComplete_VT',
 'RMSratio_SigComplete_VT',
 'SkewnessPSD_ML',
 'Range_VT',
 'FirstDerivativeMax_SigComplete_VT',
 'Median_SigComplete_ML',
 'FirstDerivative_Moment2nd_SigComplete_VT',
 'RMS_SigComplete_ML',
 'FirstDerivative_Moment3rd_SigComplete_ML',
 'CorssCorrelationPeak_MLAP',
 'Range_SigComplete_AP',
 'Range_SigComplete_ML',
 'RMS_SigComplete_AP',
 'FirstDerivativeRange_SigComplete_AP',
 'FirstDerivativeMax_SigComplete_Magnitude',
 'FirstDerivative_Moment2nd_SigComplete_Magnitude',
 'FirstDerivative_Skweness_SigComplete_Magnitude',
 'FirstDerivativeRMS_SigComplete_ML',
 'FirstDerivativeMin_SigComplete_Magnitude

In [24]:
# Print how many gyroscope features were flagged, then show their names.
n_gyro_redundant = len(gyro_corr_redundant_features)
print(n_gyro_redundant)
gyro_corr_redundant_features

93


['CorssCorrelationPeak_VTAP',
 'CorssCorrelationLag_MLAP',
 'Slope_ML',
 'CorssCorrelationPeak_MLAP',
 'Amplitude_ML',
 'DominantFreq_VT',
 'MomentPSD_3rd_ML',
 'SkewnessPSD_VT',
 'SkewnessPSD_ML',
 'KurtosisPSD_VT',
 'MeanPower_ML',
 'MeanPower_AP',
 'WidthNorm_ML',
 'SlopeNorm_ML',
 'IntegratedPower_VT',
 'SumPSD_ML',
 'Range_AP',
 'IndexHarmonicity_ML',
 'HarmonicRatio_VT',
 'HarmonicRatio_ML',
 'IndexHarmonicity_AP',
 'Range_VT',
 'FirstDerivative_Moment3rd_SigComplete_VT',
 'FirstDerivativeRange_SigComplete_VT',
 'Range_SigComplete_VT',
 'Range_ML',
 'RMS_SigComplete_ML',
 'FirstDerivativeMax_SigComplete_ML',
 'FirstDerivative_Moment3rd_SigComplete_ML',
 'FirstDerivative_Skweness_SigComplete_ML',
 'FirstDerivative_Kurtosis_SigComplete_AP',
 'FirstDerivativeRatio_SigComplete_AP',
 'FirstDerivativeRMS_SigComplete_AP',
 'RMS_SigComplete_Magnitude',
 'FirstDerivative_Skweness_SigComplete_Magnitude',
 'FirstDerivativeMin_SigComplete_Magnitude',
 'RMS_SigComplete_VT',
 'Median_SigComple

In [26]:
# Remove the features flagged by the correlation step.
# The frames are rebound under the same names, so this cell is not idempotent:
# running it a second time without rebuilding the frames raises KeyError,
# because the columns are already gone.
filtered_acc_dataset_df = filtered_acc_dataset_df.drop(acc_corr_redundant_features, axis=1)
filtered_gyro_dataset_df = filtered_gyro_dataset_df.drop(gyro_corr_redundant_features, axis=1)

In [27]:
# Inspect the accelerometer table after the correlation-based filtering.
filtered_acc_dataset_df

Unnamed: 0,Class,IndexHarmonicity_ML,FirstDerivative_Moment3rd_SigComplete_Magnitude,InterQuartileRange_SigComplete_VT,Std_SigComplete_ML,RelativePower_Below5Hz_AP,Moment2ndPSD_Plom_AP,ModeSignal_AP,RelativePower_BelowDomFr_Magnitude,CentroidSpMax_Magnitude
0,0,-1.06933,0.135301,-0.453811,-0.429353,-0.505171,-0.381535,1.425815,-0.676916,-0.092303
1,1,1.681837,1.559217,-0.610753,-0.770064,-0.370701,0.916358,0.790065,0.52294,2.227281
2,0,1.672111,-0.523202,-0.222828,-0.511502,-0.611616,-0.456534,-0.486181,-0.862825,-0.108542
3,1,1.007142,-0.508966,1.372247,-0.479112,-0.00143,-0.451724,-0.956278,0.246958,1.621542
4,0,0.252562,-0.022803,-0.961819,-0.610377,0.919677,-0.451914,0.40579,-0.747132,-0.473631
5,1,-0.906996,0.133564,-0.727572,0.265277,-1.297334,3.07422,-1.794684,-0.479493,-0.277505
6,0,0.808179,-1.688993,-0.694287,0.869764,1.397099,-0.453804,0.156805,-0.893267,-1.897663
7,1,-0.871964,-0.27325,2.507555,-0.696239,-1.279988,-0.087912,-0.757398,0.611405,0.088414
8,0,-0.267925,0.590969,-0.915989,-0.63412,1.787401,-0.454205,-0.637895,-0.781816,-0.526397
9,1,-0.984467,0.252868,-0.179156,-0.832741,-0.573928,-0.451296,1.935459,2.50043,0.002039


In [28]:
# Inspect the gyroscope table after the correlation-based filtering.
filtered_gyro_dataset_df

Unnamed: 0,Class,DominantFreq_ML,HarmonicRatio_AP,Range_SigComplete_AP,RelativePower_BelowDomFr_VT,RelativePower_5To10Hz_ML,ZeroCrossingRate_ML,Moment2ndPSD_Plom_AP,RelativePower_Below5Hz_Magnitude,ModeSignal_Magnitude
0,0,-0.996089,-1.680615,-0.045507,-1.172448,-0.442416,-0.819791,-0.898808,0.964929,1.140569
1,1,0.932871,2.111637,-0.07022,-0.49128,0.123974,0.610931,-1.296793,-0.132561,-0.176969
2,0,1.459405,0.086035,0.233395,-0.59279,1.577567,-0.467831,0.847334,-0.592845,-0.386439
3,1,-1.038511,-0.392838,-0.166474,0.381015,0.544102,0.125207,-0.586626,-1.111037,0.063733
4,0,0.196722,0.171946,0.081826,0.192401,0.318902,0.755438,2.406502,0.941986,0.772013
5,1,-0.207536,-0.109666,0.802191,0.796013,-0.068864,0.429733,-0.018349,-0.79366,-0.065989
6,0,-0.786473,-0.163787,-0.361324,-0.602623,-0.978997,-0.964728,-0.235443,1.998529,-2.156054
7,1,1.110046,0.671282,-1.196484,0.816377,-0.280795,2.649201,-0.469892,-0.502118,-0.943992
8,0,1.439442,-1.786345,2.7487,-1.029947,-0.728192,-1.028477,0.715047,0.978372,0.527311
9,1,-1.175759,0.452083,-0.431377,0.763694,-1.228274,-0.948617,-1.19453,-1.648869,1.828185


### Saving to CSV

In [29]:
# Write both filtered tables next to the normalised inputs; index=False keeps
# the row index out of the files.
filtered_acc_dataset_df.to_csv(
    f'{final_dataset_path}PatchTable_Acceleration_Filtered.csv',
    index=False,
)
filtered_gyro_dataset_df.to_csv(
    f'{final_dataset_path}PatchTable_Gyroscope_Filtered.csv',
    index=False,
)