# Terrain Classification - Combined User Data
### Created by Keenan McConkey 2019.5.25

In [1]:
from __future__ import absolute_import, division, print_function

import pandas as pd

import glob
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
from mpl_toolkits.mplot3d import Axes3D
from scipy import signal
from scipy import stats

from datetime import datetime
from decimal import Decimal

import pymrmr
import sklearn

## Part 1 - Importing Preprocessed Data

### Part (a) - Functions for Data Import

In [2]:
# Easier to read column names
std_columns = ['X Accel', 'Y Accel', 'Z Accel', 'X Gyro', 'Y Gyro', 'Z Gyro', 'Run Time', 'Epoch Time']
data_columns =  ['X Accel', 'Y Accel', 'Z Accel', 'X Gyro', 'Y Gyro', 'Z Gyro']
synthesis_columns = ['Calc X Accel', 'Calc X Vel', 'Calc Z Gyro', 'Run Time', 'Epoch Time']

'''Get columns for given label'''
def get_columns(_label):
    
    # Columns are different for synthesis data
    if 'Middle' in _label or 'Left' in _label or 'Right' in _label:
        columns = std_columns.copy()
    elif 'Synthesis' in _label:
        columns = synthesis_columns.copy()
    else:
        raise Exception('Unknown label')
    
    # For transformed datasets replace time columns with frequency
    if 'FFT' in _label or 'PSD' in _label:
        columns.remove('Epoch Time')
        
        if 'Run Time' in columns:
            columns.remove('Run Time')
        
        columns.append('Frequency')
        
    return columns

In [3]:
# Types of terrains, placements, and transforms used
terrains = ['Concrete', 'Carpet', 'Linoleum', 'Asphalt', 'Sidewalk', 'Grass', 'Gravel']

### Part (b) - Import Processed Data from Each User

In [4]:
'''Combine data from labelled datasets into a single dataframe'''
def combine_datasets(datasets):
    return pd.concat(list(datasets.values()), ignore_index=True, sort=True)

In [5]:
placements = ['Middle', 'Left', 'Right', 'Synthesis']
vectors = ['Features', 'FFTs', 'PSDLogs']
users = ['Keenan', 'Kevin', 'Mahsa']
path = 'processed_data/upgraded_manual/' 

# Nested dictionary of processed data
## Placement
### Feature Vector
#### User
placement_dict = {}
    
for placement in placements:
    vector_dict = {}

    for vector in vectors:
        user_dict = {}
        
        for user in users:
            # Get relative file location
            
            filename = placement + '_' + vector + '_Filt_' + user + '.csv' 

            # Read data and update current user dictionary
            data = pd.read_csv(path + filename)
            user_dict.update({user: data})
        
        # Combine users to form a new entry of user dictionary, save to .csv
        # NaNs carry over from issues in original data
        combined_data = combine_datasets(user_dict).dropna(axis='columns')
        combined_data = combined_data 
        combined_data.to_csv(path + placement + '_' + vector + '_Filt_Combined.csv', index=False)
        user_dict.update({'Combined': combined_data})
        
        vector_dict.update({vector: user_dict})
    
    placement_dict.update({placement: vector_dict})

In [6]:
# Check some data
placement_dict['Synthesis']['Features']['Combined'].tail()

Unnamed: 0,Autocorrelation Calc X Vel Synthesis,Autocorrelation Calc Z Gyro Synthesis,Excess Kurtosis Calc X Vel Synthesis,Excess Kurtosis Calc Z Gyro Synthesis,Frequency Center Calc X Vel Synthesis,Frequency Center Calc Z Gyro Synthesis,L2 Norm Calc X Vel Synthesis,L2 Norm Calc Z Gyro Synthesis,Label,Max Calc X Vel Synthesis,...,Root Variance Frequency Calc X Vel Synthesis,Root Variance Frequency Calc Z Gyro Synthesis,Skew Calc X Vel Synthesis,Skew Calc Z Gyro Synthesis,Std Dev Calc X Vel Synthesis,Std Dev Calc Z Gyro Synthesis,Variance Frequency Calc X Vel Synthesis,Variance Frequency Calc Z Gyro Synthesis,Zero Crossing Rate Calc X Vel Synthesis,Zero Crossing Rate Calc Z Gyro Synthesis
6893,0.554015,0.408804,1.052426,-0.365399,3.042858,0.745413,0.503205,0.679275,4,0.21129,...,0.025851,-0.549667,-1.555756,0.872356,-0.765704,-0.10676,-4.127901,-0.518106,-0.205036,-0.223452
6894,-2.159373,-0.559781,0.04049,-0.800244,0.664605,-0.382037,-1.942073,-0.655184,4,-0.299933,...,3.304603,0.596189,1.692548,-0.625821,2.917684,1.865722,-0.329394,0.503558,0.725512,0.552058
6895,-2.686132,-1.003065,3.745778,3.106931,1.185065,0.718053,-3.413512,-2.153352,4,-3.619477,...,0.09428,-0.876296,-3.451242,3.296178,-0.836251,-1.514057,-0.927175,-0.486877,-0.205036,-0.223452
6896,-2.668497,-0.972741,0.381692,-0.264521,0.100096,1.429155,-3.24898,-1.878283,4,-3.170527,...,0.370808,-0.061541,1.175944,-0.629877,-0.32207,-1.115715,0.169504,-1.409185,1.656061,2.87859
6897,-2.692987,-1.007679,0.117849,-0.007054,2.881813,1.998173,-3.554982,-2.235792,4,-3.469993,...,-0.402992,-1.150548,0.087653,0.261922,-1.174978,-1.77681,-3.784344,-2.308493,7.239351,6.756143


## Part 2 - Feature Selection mRMR (minimum Redunancy Maximum Relevance)

Try to find which features are most relevant, from all directions.

Features can be transforms or extracted features.

mRMR tries to find which features have the highest correlation to classified state and lowest correlation with other variables.

### Part (a) - Middle Frame Placement

In [7]:
pymrmr.mRMR(data=placement_dict['Middle']['Features']['Combined'], method='MID', nfeats=10)

['Root Mean Squared X Accel Middle',
 'Autocorrelation Y Gyro Middle',
 'Std Dev X Accel Middle',
 'L2 Norm X Accel Middle',
 'Mean X Accel Middle',
 'Max X Accel Middle',
 'Min X Accel Middle',
 'Mean Square Frequency X Accel Middle',
 'Std Dev X Gyro Middle',
 'Max Z Gyro Middle']

In [8]:
pymrmr.mRMR(data=placement_dict['Middle']['FFTs']['Combined'], method='MID', nfeats=10)

['FFT 0.0 Hz Z Gyro Middle',
 'FFT 38.0 Hz Y Accel Middle',
 'FFT 13.0 Hz X Accel Middle',
 'FFT 15.0 Hz Y Accel Middle',
 'FFT 54.0 Hz Y Accel Middle',
 'FFT 32.0 Hz Z Accel Middle',
 'FFT 12.0 Hz X Accel Middle',
 'FFT 7.0 Hz X Accel Middle',
 'FFT 3.0 Hz X Accel Middle',
 'FFT 32.0 Hz Y Accel Middle']

In [9]:
pymrmr.mRMR(data=placement_dict['Middle']['PSDLogs']['Combined'], method='MID', nfeats=10)

['PSD 33.0 Hz Y Gyro Middle',
 'PSD 0.0 Hz Y Gyro Middle',
 'PSD 54.0 Hz Y Accel Middle',
 'PSD 2.0 Hz Y Gyro Middle',
 'PSD 59.0 Hz Y Accel Middle',
 'PSD 45.0 Hz Y Accel Middle',
 'PSD 9.0 Hz Y Gyro Middle',
 'PSD 58.0 Hz Y Accel Middle',
 'PSD 1.0 Hz Y Gyro Middle',
 'PSD 38.0 Hz Y Gyro Middle']

### Part (b) - Left Wheel Placement

In [10]:
pymrmr.mRMR(data=placement_dict['Left']['Features']['Combined'], method='MID', nfeats=10)

['Root Mean Squared X Accel Left',
 'Mean Y Accel Left',
 'L2 Norm X Accel Left',
 'Autocorrelation Y Accel Left',
 'Max X Accel Left',
 'L2 Norm Y Accel Left',
 'Zero Crossing Rate Y Accel Left',
 'Root Mean Squared Y Accel Left',
 'Autocorrelation Z Gyro Left',
 'Frequency Center Y Accel Left']

In [11]:
pymrmr.mRMR(data=placement_dict['Left']['FFTs']['Combined'], method='MID', nfeats=10)

['FFT 1.0 Hz X Accel Left',
 'FFT 2.0 Hz Y Accel Left',
 'FFT 0.0 Hz X Gyro Left',
 'FFT 2.0 Hz X Accel Left',
 'FFT 0.0 Hz Z Gyro Left',
 'FFT 3.0 Hz Y Accel Left',
 'FFT 3.0 Hz X Accel Left',
 'FFT 0.0 Hz Y Accel Left',
 'FFT 5.0 Hz Y Accel Left',
 'FFT 0.0 Hz Z Accel Left']

In [12]:
pymrmr.mRMR(data=placement_dict['Left']['PSDLogs']['Combined'], method='MID', nfeats=10)

['PSD 61.0 Hz Z Accel Left',
 'PSD 49.0 Hz Y Gyro Left',
 'PSD 36.0 Hz Z Accel Left',
 'PSD 40.0 Hz Z Accel Left',
 'PSD 55.0 Hz Z Accel Left',
 'PSD 16.0 Hz Z Accel Left',
 'PSD 63.0 Hz Z Accel Left',
 'PSD 25.0 Hz Z Accel Left',
 'PSD 39.0 Hz Z Accel Left',
 'PSD 59.0 Hz Z Accel Left']

### Part (c) - Right Wheel Placement

In [13]:
pymrmr.mRMR(data=placement_dict['Right']['Features']['Combined'], method='MID', nfeats=10)

['Root Mean Squared X Accel Right',
 'Autocorrelation Y Accel Right',
 'L2 Norm X Accel Right',
 'Zero Crossing Rate Y Accel Right',
 'Mean X Accel Right',
 'Root Mean Squared Y Accel Right',
 'Std Dev Y Accel Right',
 'Mean Y Accel Right',
 'L2 Norm Y Accel Right',
 'Frequency Center X Accel Right']

In [14]:
pymrmr.mRMR(data=placement_dict['Right']['FFTs']['Combined'], method='MID', nfeats=10)

['FFT 1.0 Hz X Accel Right',
 'FFT 2.0 Hz Y Accel Right',
 'FFT 2.0 Hz X Accel Right',
 'FFT 0.0 Hz X Gyro Right',
 'FFT 0.0 Hz Y Accel Right',
 'FFT 0.0 Hz Z Gyro Right',
 'FFT 3.0 Hz Y Accel Right',
 'FFT 3.0 Hz X Accel Right',
 'FFT 4.0 Hz X Accel Right',
 'FFT 5.0 Hz Y Accel Right']

In [15]:
pymrmr.mRMR(data=placement_dict['Right']['PSDLogs']['Combined'], method='MID', nfeats=10)

['PSD 63.0 Hz Z Accel Right',
 'PSD 16.0 Hz Z Accel Right',
 'PSD 41.0 Hz Z Accel Right',
 'PSD 54.0 Hz Z Accel Right',
 'PSD 12.0 Hz Z Accel Right',
 'PSD 61.0 Hz Z Accel Right',
 'PSD 46.0 Hz X Accel Right',
 'PSD 25.0 Hz Z Accel Right',
 'PSD 62.0 Hz Z Accel Right',
 'PSD 47.0 Hz Y Accel Right']

### Part (d) - Synthesis "Placement"

In [32]:
pymrmr.mRMR(data=placement_dict['Synthesis']['Features']['Combined'], method='MID', nfeats=10)

['Root Mean Squared Calc X Vel Synthesis',
 'Min Calc X Vel Synthesis',
 'Max Calc X Vel Synthesis',
 'Mean Calc X Vel Synthesis',
 'L2 Norm Calc X Vel Synthesis',
 'Zero Crossing Rate Calc Z Gyro Synthesis',
 'Autocorrelation Calc Z Gyro Synthesis',
 'Skew Calc X Vel Synthesis',
 'Zero Crossing Rate Calc X Vel Synthesis',
 'Frequency Center Calc Z Gyro Synthesis']

In [17]:
pymrmr.mRMR(data=placement_dict['Synthesis']['FFTs']['Combined'], method='MID', nfeats=10)

['Label',
 'FFT 0.0 Hz Calc Z Gyro Synthesis',
 'FFT 1.0 Hz Calc Z Gyro Synthesis',
 'FFT 5.0 Hz Calc Z Gyro Synthesis',
 'FFT 47.0 Hz Calc Z Gyro Synthesis',
 'FFT 48.0 Hz Calc X Vel Synthesis',
 'FFT 48.0 Hz Calc Z Gyro Synthesis',
 'FFT 49.0 Hz Calc X Vel Synthesis',
 'FFT 49.0 Hz Calc Z Gyro Synthesis',
 'FFT 46.0 Hz Calc Z Gyro Synthesis']

In [18]:
pymrmr.mRMR(data=placement_dict['Synthesis']['PSDLogs']['Combined'], method='MID', nfeats=10)

['PSD 49.0 Hz Calc Z Gyro Synthesis',
 'PSD 0.0 Hz Calc X Vel Synthesis',
 'PSD 0.0 Hz Calc Z Gyro Synthesis',
 'PSD 51.0 Hz Calc X Vel Synthesis',
 'PSD 3.0 Hz Calc Z Gyro Synthesis',
 'PSD 1.0 Hz Calc X Vel Synthesis',
 'PSD 53.0 Hz Calc Z Gyro Synthesis',
 'PSD 2.0 Hz Calc Z Gyro Synthesis',
 'PSD 2.0 Hz Calc X Vel Synthesis',
 'PSD 1.0 Hz Calc Z Gyro Synthesis']

## Part 3 - Combining Data from Each Placement

## Part 4 - Training Classifiers

In [27]:
from sklearn.model_selection import KFold

'''Run train test k-fold times
   Returns predicted labels for each K Fold Test'''

def train_test_k_fold(combined_data, n_splits, model):
    kf = KFold(n_splits=n_splits, shuffle=True)

    # Copy data
    data = combined_data.copy()
    
    # Extract terrain labels
    labels = data.pop('Label')

    # Array of predicted labels for each k fold
    predict_k_fold = []
    test_k_fold = []

    # Split into n splits
    for train_index, test_index in kf.split(data):
        train, test = data.loc[train_index], data.loc[test_index]
        train_labels, test_labels = labels.loc[train_index], labels.loc[test_index]

        # Train and test model
        model.fit(train, train_labels)
        predict_k_fold.append(model.predict(test))
        test_k_fold.append(test_labels)

    
    return (test_k_fold, predict_k_fold)

### Part (a) - Create Accuracy Table

In [28]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Dictionary of classifiers
classifiers = {'Naive Bayes': GaussianNB(),
               'k Nearest': KNeighborsClassifier(),
               'Decision Tree': DecisionTreeClassifier(), 
               'Random Forest': RandomForestClassifier(n_estimators=100),
               'AdaBoost': AdaBoostClassifier(),
               'Support Vector Machine': SVC(gamma='scale')}

In [29]:
def create_accuracy_table(n_splits, user_name='Combined'):
    # Dataframe table of accuracies for each classifier for each placement
    vector_indices = [placement + ' ' + vector for placement in placements for vector in vectors]
    accuracy_table = pd.DataFrame({'Vector': vector_indices})

    # Calculate accuracy for each placement for each feature vector and classifier
    for classifier_name, classifier in classifiers.items():
        model = classifier

        # Row dictionary for given model
        rows = {}

        # Add current axis and classifier to row dictionary
        for placement in placements:
            for vector in vectors:
                index_name = placement + ' ' + vector

                # Extract predicted and actual labels for requested user
                actual, predict = train_test_k_fold(placement_dict[placement][vector][user_name], 
                                                    n_splits, model)

                # Take mean accuracy of k fold testing
                accuracies = []
                for i in range(len(predict)):
                    accuracies.append(accuracy_score(actual[i], predict[i]))
                rows.update({index_name: np.mean(accuracies)})

        # Update accuracy table with classifier column by mapping row names to indices
        accuracy_table[classifier_name] = accuracy_table['Vector'].map(rows)
    return accuracy_table

In [30]:
# Create accuracy table for 5 splits
accuracy_table = create_accuracy_table(5)

In [31]:
accuracy_table

Unnamed: 0,Vector,Naive Bayes,k Nearest,Decision Tree,Random Forest,AdaBoost,Support Vector Machine
0,Middle Features,0.129525,0.502706,0.774517,0.756831,0.473558,0.678402
1,Middle FFTs,0.506471,0.64778,0.615037,0.787124,0.533974,0.749468
2,Middle PSDLogs,0.560173,0.707221,0.576714,0.756509,0.536432,0.798098
3,Left Features,0.103896,0.414074,0.899592,0.90338,0.316703,0.556349
4,Left FFTs,0.481117,0.442587,0.512205,0.685575,0.512518,0.600789
5,Left PSDLogs,0.476417,0.508121,0.482177,0.672841,0.498712,0.712874
6,Right Features,0.105294,0.404844,0.926787,0.925772,0.524567,0.563218
7,Right FFTs,0.484468,0.443196,0.500799,0.649851,0.515092,0.604494
8,Right PSDLogs,0.464781,0.515381,0.467258,0.640511,0.513344,0.673765
9,Synthesis Features,0.131198,0.25877,0.928385,0.85068,0.317777,0.31676


## Part 10 - Combining Feature Selection with Classifiers

### Part (a) - Compare Top Features to Classification Accuracy

In [None]:
def subset_top(top_features, n_top):
    subset_top_features = {}
    
    for placement, features in top_features.items():
        n_features = features[0:n_top - 1]
        # Add label to ensure it remains with the data
        if 'Label' not in n_features:
            n_features.append('Label')
        
        subset_top_features.update({placement: n_features})
        
    return subset_top_features

In [None]:
def train_n_feats(combined, top_features):
    n_feats_arr = np.arange(5, len(top_features['Middle']), 5)
    accuracies = {'Middle': []}

    # Train and test for each number of top features
    for n_feat in n_feats_arr:
        
        combined_top = top_features_only(combined, subset_top(top_features, n_feat))
        # Get k fold predict and actual labels for each vector
        feature = train_test_k_fold(combined_top, 5, model)
        
        # Compare accuracies vs top features
        for placement in test_feat_top[1].keys():
            # Extract predict and actual
            predict, actual = feature[placement]
            
            # Take mean accuracy of k fold testing
            accuracy_k_fold = []
            
            for i in range(len(predict)):
                accuracy_k_fold.append(accuracy_score(actual[i], predict[i]))
            
            accuracies[placement].append(np.mean(accuracy_k_fold))

    return n_feats_arr, accuracies

## Part 11 - Classification on Single Axes

### Part (a) - Separate Combined Data

In [None]:
'''Get placement dictionary of combined data truncated to only include columns matching given tag'''
def get_matching_columns(combined_data, column_match):
    # New placement dictionary with only matching columns
    combined_matching_data = {}
    
    for placement, placement_data in combined_data.items():
        # Retrieve column names that match the given match parameter
        matching_columns = [column for column in placement_data.columns if column_match in column]
        matching_columns.append('Label')
        combined_matching_data.update({placement: placement_data[matching_columns]})
        
    return combined_matching_data

In [None]:
'''Separate combined dataframe into dictionary of axes columns'''
def separate_combined(combined_data):
    separated_data = {}
    
    for axes_column in data_columns:
        separated_data.update({axes_column: get_matching_columns(combined_data, axes_column)})
        
    return separated_data

In [None]:
feat_separated = separate_combined(feat_combined)
fft_separated = separate_combined(fft_combined)
psd_log_separated = separate_combined(psd_log_combined)

feat_separated['Z Accel']['Middle'].head()

### Part (b) - Compute Accuracy Table for Single Axes

In [None]:
def create_axes_accuracy_table(n_splits):
    # Dataframe table of accuracies for each classifier for each placement
    vector_indices = [vector + ' ' + placement + ' ' + axis for axis in data_columns for placement in placements for vector in vector_names]
    axes_accuracy_table = pd.DataFrame({'Vector': vector_indices})

    # Calculate accuracy for each axes of each placement for each feature vector and classifier
    for classifier_name, classifier in classifiers.items():
        model = classifier

        # Row dictionary for given model
        rows = {}

        for axis in data_columns:
            # Get k fold predict and actual labels for each vector for current axis
            feat  = train_test_k_fold(feat_separated[axis], n_splits, model)
            fft = train_test_k_fold(fft_separated[axis], n_splits, model)
            psd_log = train_test_k_fold(psd_log_separated[axis], n_splits, model)
            vectors = (feat, fft, psd_log)

            # Add current axis and classifier to row dictionary
            for i, vector_name in enumerate(vector_names):
                for placement in placements:
                    index_name = vector_name + ' ' + placement + ' ' + axis

                    # Extract predicted and actual labels
                    predict, actual = vectors[i][placement]

                    # Take mean accuracy of k fold testing
                    accuracies = []
                    for j in range(len(predict)):
                        accuracies.append(accuracy_score(actual[j], predict[j]))
                    rows.update({index_name: np.mean(accuracies)})

            # Update accuracy table with classifier column by mapping row names to indices
            axes_accuracy_table[classifier_name] = axes_accuracy_table['Vector'].map(rows)
    
    return axes_accuracy_table

In [None]:
axes_accuracy_table = create_axes_accuracy_table(5)

In [None]:
axes_accuracy_table

### Glossary

`Dataset` - Batch of data recorded on one terrain type

`Data Window` - Split up portion of a `Dataset`

`Direction / Axes` - Linear acceleration or gyroscope in $x,y$ or $z$

`Feature Vector` - Any feature of the data that can be used to classify terrain, e.g. Z Accel Mean, Y Accel FFT, etc

`Extracted Feature Vector` - Features that aren't from transforms, e.g. Z Accel Min, Y Accel Autocorrelation, etc

`Placement` - One of three IMU placements on the wheelchair, i.e. Middle, Left, or Right