# Terrain Classification - Combined User Data
### Created by Keenan McConkey 2019.5.25

In [1]:
from __future__ import absolute_import, division, print_function

import pandas as pd

import glob
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
from mpl_toolkits.mplot3d import Axes3D
from scipy import signal
from scipy import stats

from datetime import datetime
from decimal import Decimal

import pymrmr
import sklearn

## Part 1 - Importing Preprocessed Data

### Part (a) - Functions for Data Import

In [2]:
# Human-readable column names. Older "Frame" datasets have no 'Run Time' column.
frame_columns = ['X Accel', 'Y Accel', 'Z Accel', 'X Gyro', 'Y Gyro', 'Z Gyro', 'Epoch Time']
std_columns = ['X Accel', 'Y Accel', 'Z Accel', 'X Gyro', 'Y Gyro', 'Z Gyro', 'Run Time', 'Epoch Time']
data_columns = ['X Accel', 'Y Accel', 'Z Accel', 'X Gyro', 'Y Gyro', 'Z Gyro']


def get_columns(_label):
    """Return a fresh list of column names for the dataset described by `_label`.

    Labels containing 'Phone', 'Wheel' or 'Module' use the standard columns,
    labels containing 'Frame' use the older frame columns, and anything else
    raises Exception('Unknown label'). When the label also contains 'FFT' or
    'PSD', the time columns are dropped and 'Frequency' is appended.
    """
    # Pick the naming scheme; the new scheme takes precedence over 'Frame'
    if any(tag in _label for tag in ('Phone', 'Wheel', 'Module')):
        columns = list(std_columns)
    elif 'Frame' in _label:
        columns = list(frame_columns)
    else:
        raise Exception('Unknown label')

    # Time-domain datasets keep their columns untouched
    if 'FFT' not in _label and 'PSD' not in _label:
        return columns

    # Transformed datasets are indexed by frequency instead of time
    columns = [name for name in columns if name not in ('Epoch Time', 'Run Time')]
    columns.append('Frequency')
    return columns

In [3]:
# Terrain types used in the datasets (placements and feature vectors are listed in the import cell below)
terrains = ['Concrete', 'Carpet', 'Linoleum', 'Asphalt', 'Sidewalk', 'Grass', 'Gravel']

### Part (b) - Import Processed Data from Each User

In [4]:
def combine_datasets(datasets):
    """Stack every DataFrame held in a label -> DataFrame dictionary into one frame.

    Rows are concatenated in the dictionary's iteration order and the result
    gets a fresh 0..n-1 index.
    """
    frames = [frame for frame in datasets.values()]
    combined = pd.concat(frames, ignore_index=True)
    return combined

In [8]:
placements = ['Middle', 'Left', 'Right']
vectors = ['Features', 'FFTs', 'PSDLogs']
users = ['Keenan', 'Kevin', 'Mahsa']
path = 'processed_data/upgraded_manual/'

# Nested dictionary of processed data:
#   placement -> feature vector -> user (plus 'Combined') -> DataFrame
placement_dict = {}

for placement in placements:
    vector_dict = {}

    for vector in vectors:
        user_dict = {}

        for user in users:
            # Relative file location of this user's preprocessed data
            filename = placement + '_' + vector + '_Filt_' + user + '.csv'

            # Read data and store it under the user's name
            data = pd.read_csv(path + filename)
            user_dict[user] = data

        # Combine all users into one frame and save it next to the inputs.
        # index=False keeps the saved file free of the meaningless 0..n-1 row
        # index, which would otherwise come back as an 'Unnamed: 0' column
        # when the combined CSV is read again.
        combined_data = combine_datasets(user_dict)
        combined_data.to_csv(path + placement + '_' + vector + '_Filt_Combined.csv', index=False)
        user_dict['Combined'] = combined_data

        vector_dict[vector] = user_dict

    placement_dict[placement] = vector_dict

In [33]:
# Sanity check: last rows of the all-users extracted-feature table for the Middle placement
placement_dict['Middle']['Features']['Combined'].tail()

Unnamed: 0,Label,Mean X Accel Middle,Std Dev X Accel Middle,L2 Norm X Accel Middle,Autocorrelation X Accel Middle,Max X Accel Middle,Min X Accel Middle,Root Mean Squared X Accel Middle,Zero Crossing Rate X Accel Middle,Skew X Accel Middle,...,Min Z Gyro Middle,Root Mean Squared Z Gyro Middle,Zero Crossing Rate Z Gyro Middle,Skew Z Gyro Middle,Excess Kurtosis Z Gyro Middle,Mean Square Frequency Z Gyro Middle,Root Mean Square Frequency Z Gyro Middle,Frequency Center Z Gyro Middle,Variance Frequency Z Gyro Middle,Root Variance Frequency Z Gyro Middle
6102,6,-0.846848,-1.15541,-1.193282,-0.984408,-1.190593,0.845964,-1.193282,-0.570367,-0.888599,...,0.114058,-1.084251,1.092294,-0.832074,-0.062854,0.203685,0.440092,-0.025424,0.216404,0.440092
6103,6,-0.02159,-1.231574,-1.345895,-1.0563,-1.020336,1.245787,-1.345895,-1.410363,0.555471,...,0.5063,-1.337267,0.815931,-0.048076,0.020898,0.556325,0.771402,0.168145,0.10239,0.771402
6104,6,0.211125,-2.132865,-2.201046,-1.319494,-1.354912,1.690773,-2.201046,-1.877027,0.938994,...,0.878401,-1.662305,1.645022,1.371618,3.755749,-0.837086,-1.028651,2.378528,-2.276596,-1.028651
6105,6,0.315237,-1.605279,-1.677792,-1.18659,-1.113657,1.517311,-1.677792,-0.570367,0.190928,...,0.751412,-1.604374,2.750476,-0.025628,0.712065,-0.466675,-0.356587,4.341315,-6.044193,-0.356587
6106,6,0.489914,-2.138868,-2.090389,-1.298786,-1.400876,1.797356,-2.090389,-2.157025,0.97892,...,1.023789,-1.77462,1.092294,0.00777,-0.74839,-0.983076,-1.469428,1.968081,-1.685697,-1.469428


## Part 2 - Feature Selection mRMR (minimum Redundancy Maximum Relevance)

Try to find which features are most relevant, from all directions.

Features can be transforms or extracted features.

mRMR tries to find the features that are most relevant to the class label (here, the terrain) while being least redundant with the other selected features.

### Part (a) - Middle Frame Placement

In [10]:
# Select 10 extracted features for the Middle placement using mRMR with method 'MID'
# NOTE(review): pymrmr's README expects the class label in the first column and discretised
# inputs; the values displayed above are non-integer floats — confirm how pymrmr treats them
pymrmr.mRMR(data=placement_dict['Middle']['Features']['Combined'], method='MID', nfeats=10)

['Variance Frequency X Accel Middle',
 'Excess Kurtosis Z Accel Middle',
 'Variance Frequency X Gyro Middle',
 'Zero Crossing Rate Z Gyro Middle',
 'Variance Frequency Y Gyro Middle',
 'Mean Square Frequency Z Gyro Middle',
 'Mean Y Gyro Middle',
 'Zero Crossing Rate Y Accel Middle',
 'Mean Square Frequency X Gyro Middle',
 'Excess Kurtosis Y Gyro Middle']

In [11]:
# Select 10 FFT columns for the Middle placement using mRMR with method 'MID'
pymrmr.mRMR(data=placement_dict['Middle']['FFTs']['Combined'], method='MID', nfeats=10)

['FFT 0.0 Hz Z Gyro Middle',
 'FFT 32.0 Hz Z Accel Middle',
 'FFT 31.0 Hz Z Accel Middle',
 'FFT 30.0 Hz Z Accel Middle',
 'FFT 0.0 Hz Y Accel Middle',
 'FFT 33.0 Hz Z Accel Middle',
 'FFT 29.0 Hz Z Accel Middle',
 'FFT 34.0 Hz Z Accel Middle',
 'FFT 35.0 Hz Z Accel Middle',
 'FFT 28.0 Hz Z Accel Middle']

In [12]:
# Select 10 log-PSD columns for the Middle placement using mRMR with method 'MID'
pymrmr.mRMR(data=placement_dict['Middle']['PSDLogs']['Combined'], method='MID', nfeats=10)

['PSD 33.0 Hz Y Gyro Middle',
 'PSD 0.0 Hz Y Gyro Middle',
 'PSD 54.0 Hz Y Accel Middle',
 'PSD 2.0 Hz Y Gyro Middle',
 'PSD 59.0 Hz Y Accel Middle',
 'PSD 45.0 Hz Y Accel Middle',
 'PSD 9.0 Hz Y Gyro Middle',
 'PSD 58.0 Hz Y Accel Middle',
 'PSD 1.0 Hz Y Gyro Middle',
 'PSD 38.0 Hz Y Gyro Middle']

### Part (b) - Left Wheel Placement

In [13]:
# Select 10 extracted features for the Left placement using mRMR with method 'MID'
pymrmr.mRMR(data=placement_dict['Left']['Features']['Combined'], method='MID', nfeats=10)

['Min Y Accel Left',
 'Zero Crossing Rate Y Gyro Left',
 'Min X Accel Left',
 'Max X Accel Left',
 'Max Y Accel Left',
 'Variance Frequency X Gyro Left',
 'Mean Square Frequency X Accel Left',
 'Mean Square Frequency Y Accel Left',
 'Frequency Center Z Accel Left',
 'Zero Crossing Rate Z Gyro Left']

In [14]:
# Select 10 FFT columns for the Left placement using mRMR with method 'MID'
pymrmr.mRMR(data=placement_dict['Left']['FFTs']['Combined'], method='MID', nfeats=10)

['FFT 0.0 Hz Z Gyro Left',
 'FFT 38.0 Hz X Accel Left',
 'FFT 5.0 Hz X Accel Left',
 'FFT 4.0 Hz Y Accel Left',
 'FFT 36.0 Hz X Accel Left',
 'FFT 4.0 Hz X Accel Left',
 'FFT 33.0 Hz X Accel Left',
 'FFT 40.0 Hz X Accel Left',
 'FFT 0.0 Hz Z Accel Left',
 'FFT 1.0 Hz Y Accel Left']

In [15]:
# Select 10 log-PSD columns for the Left placement using mRMR with method 'MID'
pymrmr.mRMR(data=placement_dict['Left']['PSDLogs']['Combined'], method='MID', nfeats=10)

['PSD 61.0 Hz Z Accel Left',
 'PSD 49.0 Hz Y Gyro Left',
 'PSD 36.0 Hz Z Accel Left',
 'PSD 40.0 Hz Z Accel Left',
 'PSD 55.0 Hz Z Accel Left',
 'PSD 16.0 Hz Z Accel Left',
 'PSD 63.0 Hz Z Accel Left',
 'PSD 25.0 Hz Z Accel Left',
 'PSD 39.0 Hz Z Accel Left',
 'PSD 59.0 Hz Z Accel Left']

### Part (c) - Right Wheel Placement

In [16]:
# Select 10 extracted features for the Right placement using mRMR with method 'MID'
pymrmr.mRMR(data=placement_dict['Right']['Features']['Combined'], method='MID', nfeats=10)

['Min X Accel Right',
 'Max Y Accel Right',
 'Variance Frequency Z Accel Right',
 'Max X Accel Right',
 'Min Y Accel Right',
 'Zero Crossing Rate Y Gyro Right',
 'Frequency Center Y Accel Right',
 'Autocorrelation X Gyro Right',
 'Mean Square Frequency Y Accel Right',
 'Mean Square Frequency X Accel Right']

In [17]:
# Select 10 FFT columns for the Right placement using mRMR with method 'MID'
pymrmr.mRMR(data=placement_dict['Right']['FFTs']['Combined'], method='MID', nfeats=10)

['FFT 0.0 Hz Z Gyro Right',
 'FFT 53.0 Hz Z Accel Right',
 'FFT 55.0 Hz Z Accel Right',
 'FFT 54.0 Hz Z Accel Right',
 'FFT 52.0 Hz Z Accel Right',
 'FFT 51.0 Hz Z Accel Right',
 'FFT 56.0 Hz Z Accel Right',
 'FFT 0.0 Hz Z Accel Right',
 'FFT 50.0 Hz Z Accel Right',
 'FFT 1.0 Hz Y Accel Right']

In [18]:
# Select 10 log-PSD columns for the Right placement using mRMR with method 'MID'
pymrmr.mRMR(data=placement_dict['Right']['PSDLogs']['Combined'], method='MID', nfeats=10)

['PSD 63.0 Hz Z Accel Right',
 'PSD 16.0 Hz Z Accel Right',
 'PSD 41.0 Hz Z Accel Right',
 'PSD 54.0 Hz Z Accel Right',
 'PSD 12.0 Hz Z Accel Right',
 'PSD 61.0 Hz Z Accel Right',
 'PSD 46.0 Hz X Accel Right',
 'PSD 25.0 Hz Z Accel Right',
 'PSD 62.0 Hz Z Accel Right',
 'PSD 47.0 Hz Y Accel Right']

## Part 8 - Training Classifiers

In [34]:
from sklearn.model_selection import KFold

def train_test_k_fold(combined_data, n_splits, model, random_state=None):
    """Train and test `model` on each fold of a shuffled k-fold split.

    Parameters
    ----------
    combined_data : pandas.DataFrame
        Feature table containing a 'Label' column; it is copied, not modified.
    n_splits : int
        Number of folds.
    model : estimator with fit(X, y) and predict(X)
        Refit from scratch on the training part of every fold.
    random_state : int or None, optional
        Seed forwarded to KFold so the shuffled split can be reproduced.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    (test_k_fold, predict_k_fold) : tuple of lists
        For each fold, the true labels of the test rows and the labels
        predicted for them.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Work on a copy so popping the label column leaves the caller's frame intact
    features = combined_data.copy()
    labels = features.pop('Label')

    predict_k_fold = []
    test_k_fold = []

    for train_index, test_index in kf.split(features):
        # KFold yields positional indices, so select with iloc; loc would look
        # the numbers up as index labels and fail (or pick the wrong rows) on
        # any frame whose index is not exactly 0..n-1.
        train, test = features.iloc[train_index], features.iloc[test_index]
        train_labels, test_labels = labels.iloc[train_index], labels.iloc[test_index]

        # Train and test model
        model.fit(train, train_labels)
        predict_k_fold.append(model.predict(test))
        test_k_fold.append(test_labels)

    return (test_k_fold, predict_k_fold)

### Part (a) - Create Accuracy Table

In [30]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

# Dictionary of classifiers: display name -> scikit-learn estimator instance.
# The keys become the column names of the accuracy tables built below.
# Only n_estimators (Random Forest) and gamma (SVC) are set explicitly here.
# NOTE(review): no random_state is passed to any estimator, so results may differ between runs — confirm this is intended
classifiers = {'Naive Bayes': GaussianNB(),
               'k Nearest': KNeighborsClassifier(),
               'Decision Tree': DecisionTreeClassifier(), 
               'Random Forest': RandomForestClassifier(n_estimators=100),
               'AdaBoost': AdaBoostClassifier(),
               'Support Vector Machine': SVC(gamma='scale')}

In [31]:
def create_accuracy_table(n_splits, user_name='Combined'):
    """Build a table of mean k-fold accuracy per placement/vector and classifier.

    Parameters
    ----------
    n_splits : int
        Number of folds passed to train_test_k_fold.
    user_name : str, optional
        Key of the user entry to read from placement_dict (default 'Combined').

    Returns
    -------
    pandas.DataFrame
        One row per '<placement> <vector>' combination (column 'Vector') and
        one column per entry of `classifiers` holding the mean fold accuracy.

    Relies on the notebook-level names placements, vectors, classifiers,
    placement_dict and train_test_k_fold.
    """
    # Imported here because no visible cell imports accuracy_score; without
    # it the function raises NameError on a fresh kernel.
    from sklearn.metrics import accuracy_score

    # One row per placement / feature-vector combination
    vector_indices = [placement + ' ' + vector for placement in placements for vector in vectors]
    accuracy_table = pd.DataFrame({'Vector': vector_indices})

    for classifier_name, model in classifiers.items():
        # Row name -> mean accuracy for the current classifier
        rows = {}

        for placement in placements:
            for vector in vectors:
                # True and predicted labels of every fold for the requested user
                actual, predict = train_test_k_fold(placement_dict[placement][vector][user_name],
                                                    n_splits, model)

                # Mean accuracy over the folds
                fold_accuracies = [accuracy_score(actual[i], predict[i]) for i in range(len(predict))]
                rows[placement + ' ' + vector] = np.mean(fold_accuracies)

        # Add the classifier column by mapping row names to their accuracies
        accuracy_table[classifier_name] = accuracy_table['Vector'].map(rows)

    return accuracy_table

In [36]:
# Create accuracy table for the combined-user data using 5-fold splits
accuracy_table = create_accuracy_table(5)

In [37]:
# Display mean k-fold accuracy per placement/vector (rows) and classifier (columns)
accuracy_table

Unnamed: 0,Vector,Naive Bayes,k Nearest,Decision Tree,Random Forest,AdaBoost,Support Vector Machine
0,Middle Features,0.120346,0.507122,0.772391,0.749795,0.474536,0.675294
1,Middle FFTs,0.507616,0.649911,0.615031,0.791551,0.555754,0.75454
2,Middle PSDLogs,0.559362,0.703291,0.577205,0.749143,0.556413,0.798756
3,Left Features,0.113758,0.411806,0.889732,0.906717,0.277721,0.560441
4,Left FFTs,0.4796,0.443956,0.506144,0.684214,0.515089,0.605794
5,Left PSDLogs,0.478541,0.506593,0.485663,0.66692,0.488851,0.714392
6,Right Features,0.108211,0.40542,0.929852,0.924747,0.549084,0.572846
7,Right FFTs,0.480532,0.446703,0.499778,0.653785,0.52224,0.598074
8,Right PSDLogs,0.46478,0.51874,0.484906,0.643723,0.508969,0.673619


## Part 10 - Combining Feature Selection with Classifiers

### Part (a) - Compare Top Features to Classification Accuracy

In [93]:
def subset_top(top_features, n_top):
    """Keep the first `n_top` ranked features for every placement, plus 'Label'.

    Parameters
    ----------
    top_features : dict
        Placement name -> sequence of feature names ordered best first.
    n_top : int
        Number of leading features to keep for each placement.

    Returns
    -------
    dict
        Placement name -> new list holding the first `n_top` names, with
        'Label' appended when it is not already among them. The input
        sequences are left untouched.
    """
    subset_top_features = {}

    for placement, features in top_features.items():
        # The slice end is n_top (the previous `n_top - 1` silently dropped one
        # feature); list() gives an appendable copy for any sequence type.
        n_features = list(features[:max(n_top, 0)])

        # Add label to ensure it remains with the data
        if 'Label' not in n_features:
            n_features.append('Label')

        subset_top_features[placement] = n_features

    return subset_top_features

In [94]:
def train_n_feats(combined, top_features):
    """Collect mean k-fold accuracy for increasing numbers of top-ranked features.

    Feature counts run from 5 in steps of 5, stopping before the length of
    top_features['Middle']. Returns the array of feature counts and a dict
    of accuracy lists keyed by placement.

    NOTE(review): this function uses several names that are neither parameters
    nor defined in the visible part of the notebook: top_features_only, model,
    test_feat_top and accuracy_score. Confirm they exist on a fresh kernel.
    NOTE(review): the result of train_test_k_fold is indexed by placement below,
    while the train_test_k_fold visible in this notebook returns a
    (test_k_fold, predict_k_fold) tuple — this looks written for a different
    version of that function; verify.
    """
    n_feats_arr = np.arange(5, len(top_features['Middle']), 5)
    # NOTE(review): only 'Middle' is pre-seeded; any other placement key yielded by
    # test_feat_top[1] would raise KeyError at the append below — confirm
    accuracies = {'Middle': []}

    # Train and test for each number of top features
    for n_feat in n_feats_arr:
        
        combined_top = top_features_only(combined, subset_top(top_features, n_feat))
        # Get k fold predict and actual labels for each vector
        feature = train_test_k_fold(combined_top, 5, model)
        
        # Compare accuracies vs top features
        for placement in test_feat_top[1].keys():
            # Extract predict and actual
            predict, actual = feature[placement]
            
            # Take mean accuracy of k fold testing
            accuracy_k_fold = []
            
            for i in range(len(predict)):
                accuracy_k_fold.append(accuracy_score(actual[i], predict[i]))
            
            accuracies[placement].append(np.mean(accuracy_k_fold))

    return n_feats_arr, accuracies

## Part 11 - Classification on Single Axes

### Part (a) - Separate Combined Data

In [107]:
def get_matching_columns(combined_data, column_match):
    """Restrict every placement's DataFrame to the columns containing `column_match`.

    `combined_data` maps placement name -> DataFrame. For each placement the
    result holds the columns whose name contains `column_match` (in their
    original order) followed by the 'Label' column.
    """
    def _select(frame):
        # Matching columns first, label last so it stays with the data
        wanted = [name for name in frame.columns if column_match in name]
        return frame[wanted + ['Label']]

    return {placement: _select(frame) for placement, frame in combined_data.items()}

In [108]:
def separate_combined(combined_data):
    """Split a combined placement dictionary into one placement dictionary per axis.

    Returns a dict keyed by each entry of data_columns whose value is the
    result of get_matching_columns for that axis name.
    """
    return {
        axes_column: get_matching_columns(combined_data, axes_column)
        for axes_column in data_columns
    }

In [109]:
# Split each combined placement dictionary into per-axis placement dictionaries
# NOTE(review): feat_combined, fft_combined and psd_log_combined are not defined in the
# visible part of this notebook — confirm an earlier cell creates them on a fresh kernel
feat_separated = separate_combined(feat_combined)
fft_separated = separate_combined(fft_combined)
psd_log_separated = separate_combined(psd_log_combined)

# Preview the Z Accel columns (plus Label) of the Middle placement
feat_separated['Z Accel']['Middle'].head()

Unnamed: 0,Mean Z Accel Middle,Std Dev Z Accel Middle,L2 Norm Z Accel Middle,Autocorrelation Z Accel Middle,Max Z Accel Middle,Min Z Accel Middle,Root Mean Squared Z Accel Middle,Zero Crossing Rate Z Accel Middle,Skew Z Accel Middle,Excess Kurtosis Z Accel Middle,Mean Square Frequency Z Accel Middle,Root Mean Square Frequency Z Accel Middle,Frequency Center Z Accel Middle,Variance Frequency Z Accel Middle,Root Variance Frequency Z Accel Middle,Label
0,0.122339,-1.008275,-1.009104,-0.899187,-0.681546,1.023153,-1.009104,1.566721,1.780103,0.401485,-0.456228,-0.401738,1.075185,-1.19281,-0.401738,6
1,-0.182195,-1.578249,-1.578728,-1.199407,-1.867886,1.367937,-1.578728,-0.639562,-1.483517,-0.442405,-0.926447,-1.397661,-1.69377,0.937022,-1.397661,6
2,-0.776866,-0.696951,-0.696603,-0.695432,-0.402445,-0.086405,-0.696603,-0.394419,1.154003,1.526448,-0.52961,-0.519713,0.086255,-0.333001,-0.519713,6
3,1.484097,1.939577,1.939385,2.12427,1.426864,-1.482568,1.939385,-0.639562,-0.640592,-0.656278,1.882451,1.869241,0.749348,0.366038,1.869241,6
4,-0.402704,2.691306,2.690768,3.288577,2.007963,-2.05358,2.690768,0.831293,-0.214282,-0.508332,5.626296,3.975964,0.494108,2.610364,3.975964,6


### Part (b) - Compute Accuracy Table for Single Axes

In [110]:
def create_axes_accuracy_table(n_splits):
    """Build a table of mean k-fold accuracy per vector/placement/axis and classifier.

    Rows are named '<vector name> <placement> <axis>'; one column is added per
    entry of `classifiers`. Reads the notebook-level feat_separated,
    fft_separated and psd_log_separated dictionaries.

    NOTE(review): vector_names and accuracy_score are not defined or imported in
    the visible part of the notebook — confirm they exist on a fresh kernel.
    NOTE(review): each train_test_k_fold result is indexed by placement below,
    while the train_test_k_fold visible in this notebook takes a single DataFrame
    and returns a (test_k_fold, predict_k_fold) tuple — this looks written for a
    different version of that function; verify.
    """
    # Dataframe table of accuracies for each classifier for each placement
    vector_indices = [vector + ' ' + placement + ' ' + axis for axis in data_columns for placement in placements for vector in vector_names]
    axes_accuracy_table = pd.DataFrame({'Vector': vector_indices})

    # Calculate accuracy for each axes of each placement for each feature vector and classifier
    for classifier_name, classifier in classifiers.items():
        model = classifier

        # Row dictionary for given model
        rows = {}

        for axis in data_columns:
            # Get k fold predict and actual labels for each vector for current axis
            feat  = train_test_k_fold(feat_separated[axis], n_splits, model)
            fft = train_test_k_fold(fft_separated[axis], n_splits, model)
            psd_log = train_test_k_fold(psd_log_separated[axis], n_splits, model)
            # Local tuple ordered to line up with vector_names; it shadows the
            # notebook-level `vectors` list inside this function
            vectors = (feat, fft, psd_log)

            # Add current axis and classifier to row dictionary
            for i, vector_name in enumerate(vector_names):
                for placement in placements:
                    index_name = vector_name + ' ' + placement + ' ' + axis

                    # Extract predicted and actual labels
                    predict, actual = vectors[i][placement]

                    # Take mean accuracy of k fold testing
                    accuracies = []
                    for j in range(len(predict)):
                        accuracies.append(accuracy_score(actual[j], predict[j]))
                    rows.update({index_name: np.mean(accuracies)})

            # Update accuracy table with classifier column by mapping row names to indices
            # (this sits inside the axis loop, so the column is re-mapped after every axis;
            # rows not yet computed map to NaN until the last axis has been processed)
            axes_accuracy_table[classifier_name] = axes_accuracy_table['Vector'].map(rows)
    
    return axes_accuracy_table

In [111]:
# Create the single-axis accuracy table using 5-fold splits
axes_accuracy_table = create_axes_accuracy_table(5)

In [112]:
# Display mean k-fold accuracy per vector/placement/axis (rows) and classifier (columns)
axes_accuracy_table

Unnamed: 0,Vector,Naive Bayes,k Nearest,Decision Tree,Random Forest,AdaBoost,Support Vector Machine
0,ExtractedFeatures Middle X Accel,0.109859,0.244608,0.271815,0.340092,0.239223,0.296711
1,FFT Middle X Accel,0.413498,0.377312,0.366271,0.507962,0.380393,0.515144
2,PSDLog Middle X Accel,0.42428,0.320846,0.365246,0.508733,0.3845,0.501281
3,ExtractedFeatures Middle Y Accel,0.128843,0.340345,0.389114,0.500774,0.301848,0.399894
4,FFT Middle Y Accel,0.50745,0.490753,0.428638,0.601383,0.371913,0.601134
5,PSDLog Middle Y Accel,0.512829,0.469967,0.413241,0.589837,0.388606,0.594454
6,ExtractedFeatures Middle Z Accel,0.114987,0.256411,0.279777,0.399636,0.252305,0.319043
7,FFT Middle Z Accel,0.517455,0.522072,0.45765,0.613958,0.403233,0.594713
8,PSDLog Middle Z Accel,0.517968,0.480749,0.438403,0.591898,0.391178,0.587786
9,ExtractedFeatures Middle X Gyro,0.112937,0.327518,0.290043,0.43018,0.261294,0.377825


### Glossary

`Dataset` - Batch of data recorded on one terrain type

`Data Window` - Split up portion of a `Dataset`

`Direction / Axes` - Linear acceleration or gyroscope measurement along $x$, $y$, or $z$

`Feature Vector` - Any feature of the data that can be used to classify terrain, e.g. Z Accel Mean, Y Accel FFT, etc

`Extracted Feature Vector` - Features that aren't from transforms, e.g. Z Accel Min, Y Accel Autocorrelation, etc

`Placement` - One of three IMU placements on the wheelchair, i.e. Middle, Left, or Right