# Terrain Classification - Combined User Data
### Created by Keenan McConkey 2019.08.01

In [1]:
from __future__ import absolute_import, division, print_function

import pandas as pd

import glob
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
from mpl_toolkits.mplot3d import Axes3D
from scipy import signal
from scipy import stats

from datetime import datetime
from decimal import Decimal

import pymrmr
import sklearn

## Part 1 - Importing Preprocessed Data

### Part (a) - Functions for Data Import

In [11]:
# All the terrains, placements, vectors, power-assistance, users in the study.
# The data-import loop below iterates over these lists to build file names.
terrains = ['Concrete', 'Carpet', 'Linoleum', 'Asphalt', 'Sidewalk', 'Grass', 'Gravel']
# Power-assistance modes; the lower-cased name is the data sub-directory
powers = ['Manual', 'Power']
# Placements loaded for manual data
placements_manual = ['Middle', 'Left', 'Right', 'Synthesis']
# Placements loaded for power-assisted data (middle frame only)
placements_power = ['Middle']
# Feature-vector types, as spelled in the processed file names
vectors = ['Features', 'FFTs', 'PSDLogs']
# Users whose processed files are loaded
users = ['Keenan', 'Kevin', 'Mahsa']

### Part (b) - Import Processed Data from Each User

In [12]:
def combine_datasets(datasets):
    '''Stack every labelled dataset into a single dataframe.

    datasets is a dictionary whose values are dataframes; they are
    concatenated row-wise in dictionary order. The result gets a fresh
    integer index and, where the inputs' columns differ, sorted columns.
    '''
    frames = [frame for frame in datasets.values()]
    combined = pd.concat(frames, ignore_index=True, sort=True)
    return combined

In [13]:
path = 'processed_data/new_setup/'

# Nested dictionary of processed data:
# - Power assistance type
# -- Placement
# --- Feature Vector ('Features', 'FFTs', 'PSDLogs', plus 'Combined')
# ---- User (each user; the three base vectors also get an 'All' entry)
# Initialised here so the cell runs on a fresh kernel.
power_dict = {}

for power in powers:
    placement_dict = {}

    # Power-assisted data only lists the middle placement
    if power == 'Power':
        placements = placements_power.copy()
    else:
        placements = placements_manual.copy()

    # Power-assisted file names carry a '_Power' suffix
    suffix = '_Power' if power == 'Power' else ''

    for placement in placements:
        vector_dict = {}

        for vector in vectors:
            user_dict = {}

            for user in users:
                # File name based on above parameters
                filename = (power.lower() + '/' + placement + '_' + vector
                            + '_Filt_' + user + suffix + '.csv')

                # Read data and update current user dictionary
                user_dict[user] = pd.read_csv(path + filename)

            # Combine users to form a new entry of user dictionary, save to .csv
            # NaNs carry over from issues in original data
            combined_data = combine_datasets(user_dict).dropna(axis='columns')

            # The suffix keeps the Power output from overwriting the Manual
            # output that shares the same placement and vector name.
            combined_data.to_csv(path + placement + '_' + vector + '_Filt_All' + suffix + '.csv',
                                 index=False)
            user_dict['All'] = combined_data

            vector_dict[vector] = user_dict

        # Create a new combined feature vector entry: every base vector of a
        # user placed side by side (column-wise)
        users_combined_vector = {}

        for user in users:
            user_all_vectors = [vector_dict[vector][user] for vector in vectors]
            combined_vectors = pd.concat(user_all_vectors, axis='columns')

            # The per-vector frames previewed below each carry their own
            # 'Label' column; keep only the first copy so 'Label' stays a
            # single column (assumes rows are aligned across vectors).
            combined_vectors = combined_vectors.loc[:, ~combined_vectors.columns.duplicated()]
            users_combined_vector[user] = combined_vectors

        vector_dict['Combined'] = users_combined_vector

        placement_dict[placement] = vector_dict

    power_dict[power] = placement_dict

In [15]:
# Check some data: last rows of Mahsa's extracted features
# for the power-assisted, middle-placement recordings
power_dict['Power']['Middle']['Features']['Mahsa'].tail()

Unnamed: 0,Label,Mean X Accel Middle,Std Dev X Accel Middle,L2 Norm X Accel Middle,Autocorrelation X Accel Middle,Max X Accel Middle,Min X Accel Middle,Root Mean Squared X Accel Middle,Zero Crossing Rate X Accel Middle,Skew X Accel Middle,...,Min Z Gyro Middle,Root Mean Squared Z Gyro Middle,Zero Crossing Rate Z Gyro Middle,Skew Z Gyro Middle,Excess Kurtosis Z Gyro Middle,Mean Square Frequency Z Gyro Middle,Root Mean Square Frequency Z Gyro Middle,Frequency Center Z Gyro Middle,Variance Frequency Z Gyro Middle,Root Variance Frequency Z Gyro Middle
2164,3,-1.126572,0.424742,0.387467,0.178925,0.62326,0.235609,0.387467,-0.061161,1.214306,...,0.921098,0.069563,-0.127,0.089422,-0.983848,-0.288748,-0.296445,-0.338694,0.341499,-0.296445
2165,3,-0.588483,-0.136424,-0.19896,-0.318336,-0.059482,0.193594,-0.19896,-0.774704,0.757073,...,0.840151,-0.219364,-0.127,1.127261,1.014379,1.094327,1.715763,1.340716,-0.744721,1.715763
2166,3,-1.490103,-0.666548,-0.624511,-0.609363,-0.479324,0.513408,-0.624511,-1.060121,1.178308,...,0.736654,-1.018714,-0.127,-0.572848,-0.752103,-0.440181,-0.983096,-0.432959,0.364952,-0.983096
2167,3,0.19178,-0.576278,-0.616766,-0.604591,-0.555568,0.422258,-0.616766,-1.773664,-0.70867,...,0.623496,-1.372115,-0.127,1.286244,-0.244252,-0.391205,-0.669259,-0.71803,0.411741,-0.669259
2168,3,-0.513519,-1.360531,-1.417987,-0.995189,-1.120245,1.003544,-1.417987,0.652382,-0.429179,...,0.512163,-1.628647,-0.127,1.243623,-0.526091,-0.119714,0.11531,-0.238542,0.312216,0.11531


In [16]:
# Check some more data: last rows of the PSD log vectors for all users
# combined ('All'), manual propulsion, synthesis placement
power_dict['Manual']['Synthesis']['PSDLogs']['All'].tail()

Unnamed: 0,Label,PSD 0.0 Hz Calc X Vel Synthesis,PSD 0.0 Hz Calc Z Gyro Synthesis,PSD 0.0 Hz Left X Accel Synthesis,PSD 0.0 Hz Left X Gyro Synthesis,PSD 0.0 Hz Left XY Accel Synthesis,PSD 0.0 Hz Left Y Accel Synthesis,PSD 0.0 Hz Left Y Gyro Synthesis,PSD 0.0 Hz Left Z Accel Synthesis,PSD 0.0 Hz Left Z Gyro Synthesis,...,PSD 9.0 Hz Left Y Gyro Synthesis,PSD 9.0 Hz Left Z Accel Synthesis,PSD 9.0 Hz Left Z Gyro Synthesis,PSD 9.0 Hz Right X Accel Synthesis,PSD 9.0 Hz Right X Gyro Synthesis,PSD 9.0 Hz Right XY Accel Synthesis,PSD 9.0 Hz Right Y Accel Synthesis,PSD 9.0 Hz Right Y Gyro Synthesis,PSD 9.0 Hz Right Z Accel Synthesis,PSD 9.0 Hz Right Z Gyro Synthesis
6893,4,-3.264952,-1.57149,1.652339,-0.386797,-0.835998,1.135152,-0.74908,-0.414393,-1.339272,...,-4.166539,-1.317774,-3.8728,-1.143138,-3.565039,-2.09229,-2.037026,-3.672597,-2.107728,-2.999258
6894,4,-1.495158,-0.999469,0.601438,-1.195582,-1.104471,0.04165,-1.090774,-0.846955,-0.112306,...,-3.887024,-3.48581,-2.233419,-2.921208,-5.514269,-3.295627,-4.697588,-3.423919,-3.103458,-3.749833
6895,4,-3.542684,-2.832347,-2.496859,-3.868736,-3.384072,-5.362448,-2.796001,-4.009233,-2.080771,...,-4.613258,-3.215537,-3.93745,-1.912095,-6.133095,-4.566719,-4.25593,-3.958968,-3.373509,-5.069013
6896,4,-2.720504,-2.549193,-0.816001,-3.569763,-3.203245,-1.43203,-2.328293,-3.220571,-1.429644,...,-4.101137,-2.700848,-3.793125,-1.232665,-5.151142,-3.142856,-3.264628,-3.841875,-2.577016,-4.081245
6897,4,-4.991653,-3.296635,-1.892388,-3.347691,-3.435017,-2.227256,-3.890363,-2.541126,-3.268783,...,-4.012663,-2.341277,-4.175299,-1.95301,-6.21911,-2.836621,-3.186752,-4.585258,-4.199869,-5.481877


In [17]:
# Check even more data: last rows of Keenan's combined feature vectors
# (all vector types side by side), manual propulsion, left placement
power_dict['Manual']['Left']['Combined']['Keenan'].tail()

Unnamed: 0,Label,Mean X Accel Left,Std Dev X Accel Left,L2 Norm X Accel Left,Autocorrelation X Accel Left,Max X Accel Left,Min X Accel Left,Root Mean Squared X Accel Left,Zero Crossing Rate X Accel Left,Skew X Accel Left,...,PSD 1.0 Hz X Gyro Left,PSD 1.0 Hz Z Accel Left,PSD 1.0 Hz Y Accel Left,PSD 1.0 Hz X Accel Left,PSD 0.0 Hz Z Gyro Left,PSD 0.0 Hz Y Gyro Left,PSD 0.0 Hz X Gyro Left,PSD 0.0 Hz Z Accel Left,PSD 0.0 Hz Y Accel Left,PSD 0.0 Hz X Accel Left
2295,0,-1.026906,-0.82467,-2.309737,-2.021861,-1.796506,-0.147242,-2.309737,1.070393,0.004097,...,-2.335416,-1.37447,0.781048,-0.295132,-0.017801,-1.936974,-2.456735,-1.512415,1.706674,1.390008
2296,0,0.377827,0.45765,0.33305,0.277991,-0.091166,-0.221493,0.33305,-0.754909,-1.104432,...,-2.252106,-1.466722,0.30829,0.704295,-0.227063,-2.411583,-2.533022,-1.951089,1.570336,1.619446
2297,0,-0.347453,0.487954,0.004273,-0.047167,0.140666,-0.353502,0.004273,-0.146475,0.145485,...,-2.649791,-1.350961,0.166676,-0.331784,-0.57383,-4.578342,-2.003783,-0.948404,1.660901,1.679368
2298,0,-0.556549,0.49315,0.031598,-0.020565,-0.060578,-0.410998,0.031598,-0.146475,0.767939,...,-2.700846,-1.30385,-0.299755,-0.226737,-1.14117,-3.038069,-2.50689,-0.559873,1.653538,1.685454
2299,0,-0.56531,0.238233,-0.485607,-0.511073,0.015808,-0.17373,-0.485607,-0.146475,0.796493,...,-2.908061,-1.833999,0.114618,0.177328,-0.84676,-3.112945,-3.108316,-0.641099,1.6801,1.622272


## Part 2 - Feature Selection mRMR (minimum Redundancy Maximum Relevance)

Try to find which features are most relevant, from all directions.

Features can be transforms or extracted features.

mRMR tries to find the features that are most relevant to the classified state (the terrain label) while being least redundant with the other selected features.

### Part (a) - Middle Frame Placement

In [None]:
# Top 10 mRMR features (MID scheme): manual, middle placement, extracted features, all users ('All').
# pymrmr reads the first column as the class, so 'Label' is moved to the front
# (NOTE(review): pymrmr also expects discretised values - confirm).
mrmr_input = power_dict['Manual']['Middle']['Features']['All']
pymrmr.mRMR(data=mrmr_input[['Label'] + mrmr_input.columns.drop('Label').tolist()], method='MID', nfeats=10)

In [None]:
# Top 10 mRMR features (MID scheme): manual, middle placement, FFT vectors, all users ('All').
# pymrmr reads the first column as the class, so 'Label' is moved to the front
# (NOTE(review): pymrmr also expects discretised values - confirm).
mrmr_input = power_dict['Manual']['Middle']['FFTs']['All']
pymrmr.mRMR(data=mrmr_input[['Label'] + mrmr_input.columns.drop('Label').tolist()], method='MID', nfeats=10)

In [None]:
# Top 10 mRMR features (MID scheme): manual, middle placement, PSD log vectors, all users ('All').
# pymrmr reads the first column as the class, so 'Label' is moved to the front
# (NOTE(review): pymrmr also expects discretised values - confirm).
mrmr_input = power_dict['Manual']['Middle']['PSDLogs']['All']
pymrmr.mRMR(data=mrmr_input[['Label'] + mrmr_input.columns.drop('Label').tolist()], method='MID', nfeats=10)

### Part (b) - Left Wheel Placement

In [None]:
# Top 10 mRMR features (MID scheme): manual, left placement, extracted features, all users ('All').
# pymrmr reads the first column as the class, so 'Label' is moved to the front
# (NOTE(review): pymrmr also expects discretised values - confirm).
mrmr_input = power_dict['Manual']['Left']['Features']['All']
pymrmr.mRMR(data=mrmr_input[['Label'] + mrmr_input.columns.drop('Label').tolist()], method='MID', nfeats=10)

In [None]:
# Top 10 mRMR features (MID scheme): manual, left placement, FFT vectors, all users ('All').
# pymrmr reads the first column as the class, so 'Label' is moved to the front
# (NOTE(review): pymrmr also expects discretised values - confirm).
mrmr_input = power_dict['Manual']['Left']['FFTs']['All']
pymrmr.mRMR(data=mrmr_input[['Label'] + mrmr_input.columns.drop('Label').tolist()], method='MID', nfeats=10)

In [None]:
# Top 10 mRMR features (MID scheme): manual, left placement, PSD log vectors, all users ('All').
# pymrmr reads the first column as the class, so 'Label' is moved to the front
# (NOTE(review): pymrmr also expects discretised values - confirm).
mrmr_input = power_dict['Manual']['Left']['PSDLogs']['All']
pymrmr.mRMR(data=mrmr_input[['Label'] + mrmr_input.columns.drop('Label').tolist()], method='MID', nfeats=10)

### Part (c) - Right Wheel Placement

In [None]:
# Top 10 mRMR features (MID scheme): manual, right placement, extracted features, all users ('All').
# pymrmr reads the first column as the class, so 'Label' is moved to the front
# (NOTE(review): pymrmr also expects discretised values - confirm).
mrmr_input = power_dict['Manual']['Right']['Features']['All']
pymrmr.mRMR(data=mrmr_input[['Label'] + mrmr_input.columns.drop('Label').tolist()], method='MID', nfeats=10)

In [None]:
# Top 10 mRMR features (MID scheme): manual, right placement, FFT vectors, all users ('All').
# pymrmr reads the first column as the class, so 'Label' is moved to the front
# (NOTE(review): pymrmr also expects discretised values - confirm).
mrmr_input = power_dict['Manual']['Right']['FFTs']['All']
pymrmr.mRMR(data=mrmr_input[['Label'] + mrmr_input.columns.drop('Label').tolist()], method='MID', nfeats=10)

In [None]:
# Top 10 mRMR features (MID scheme): manual, right placement, PSD log vectors, all users ('All').
# pymrmr reads the first column as the class, so 'Label' is moved to the front
# (NOTE(review): pymrmr also expects discretised values - confirm).
mrmr_input = power_dict['Manual']['Right']['PSDLogs']['All']
pymrmr.mRMR(data=mrmr_input[['Label'] + mrmr_input.columns.drop('Label').tolist()], method='MID', nfeats=10)

### Part (d) - Synthesis "Placement"

In [None]:
# Top 10 mRMR features (MID scheme): manual, synthesis placement, extracted features, all users ('All').
# pymrmr reads the first column as the class, so 'Label' is moved to the front
# (NOTE(review): pymrmr also expects discretised values - confirm).
mrmr_input = power_dict['Manual']['Synthesis']['Features']['All']
pymrmr.mRMR(data=mrmr_input[['Label'] + mrmr_input.columns.drop('Label').tolist()], method='MID', nfeats=10)

In [None]:
# Top 10 mRMR features (MID scheme): manual, synthesis placement, FFT vectors, all users ('All').
# pymrmr reads the first column as the class, so 'Label' is moved to the front
# (NOTE(review): pymrmr also expects discretised values - confirm).
mrmr_input = power_dict['Manual']['Synthesis']['FFTs']['All']
pymrmr.mRMR(data=mrmr_input[['Label'] + mrmr_input.columns.drop('Label').tolist()], method='MID', nfeats=10)

In [None]:
# Top 10 mRMR features (MID scheme): manual, synthesis placement, PSD log vectors, all users ('All').
# pymrmr reads the first column as the class, so 'Label' is moved to the front
# (NOTE(review): pymrmr also expects discretised values - confirm).
mrmr_input = power_dict['Manual']['Synthesis']['PSDLogs']['All']
pymrmr.mRMR(data=mrmr_input[['Label'] + mrmr_input.columns.drop('Label').tolist()], method='MID', nfeats=10)

## Part 3 - Combining Data from Each Placement

## Part 4 - Training Classifiers

In [None]:
from sklearn.model_selection import KFold

def train_test_k_fold(combined_data, n_splits, model, random_state=None):
    '''Run train/test over shuffled k folds.

    Args:
        combined_data: DataFrame of feature columns plus a 'Label' column.
        n_splits: Number of folds.
        model: Estimator exposing fit(X, y) and predict(X); it is refit on
            every fold.
        random_state: Optional seed passed to KFold so the shuffled split
            can be reproduced. Defaults to None (unseeded, as before).

    Returns:
        Tuple (test_k_fold, predict_k_fold): two lists with one entry per
        fold, holding the actual test labels and the predicted labels.
    '''
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Copy data so popping the label column leaves the caller's frame intact
    data = combined_data.copy()

    # Extract terrain labels
    labels = data.pop('Label')

    # Predicted and actual labels for each k fold
    predict_k_fold = []
    test_k_fold = []

    # Split into n splits
    for train_index, test_index in kf.split(data):
        # KFold yields positional indices, so select rows by position (iloc);
        # label-based .loc only works when the index happens to be 0..n-1.
        train, test = data.iloc[train_index], data.iloc[test_index]
        train_labels, test_labels = labels.iloc[train_index], labels.iloc[test_index]

        # Train and test model
        model.fit(train, train_labels)
        predict_k_fold.append(model.predict(test))
        test_k_fold.append(test_labels)

    return (test_k_fold, predict_k_fold)

### Part (a) - Create Accuracy Table

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Dictionary of classifiers, keyed by the display name used as the column
# header in the accuracy tables. Insertion order sets the column order.
# Hyperparameters are library defaults except where set explicitly below.
classifiers = {'Naive Bayes': GaussianNB(),
               'k Nearest': KNeighborsClassifier(),
               'Decision Tree': DecisionTreeClassifier(), 
               'Random Forest': RandomForestClassifier(n_estimators=100),
               'AdaBoost': AdaBoostClassifier(),
               'Support Vector Machine': SVC(gamma='scale')}

In [None]:
def create_accuracy_table(n_splits, user_name='All', power='Manual'):
    '''Build a table of mean k-fold accuracy per classifier.

    Rows are '<placement> <vector>' combinations for the chosen power mode;
    columns are the classifiers in the classifiers dictionary.

    Args:
        n_splits: Number of folds passed to train_test_k_fold.
        user_name: User key to evaluate. Defaults to 'All' (every user
            combined), which is the key the import loop stores; the old
            default 'Combined' is not a user key and raised KeyError.
        power: Key of power_dict to evaluate. Passed explicitly rather than
            read from whatever the import loop last left in placement_dict.

    Returns:
        DataFrame with a 'Vector' column and one accuracy column per classifier.
    '''
    placement_data = power_dict[power]

    # Dataframe table of accuracies for each classifier for each placement
    vector_indices = [placement + ' ' + vector
                      for placement in placement_data for vector in vectors]
    accuracy_table = pd.DataFrame({'Vector': vector_indices})

    # Calculate accuracy for each placement for each feature vector and classifier
    for classifier_name, model in classifiers.items():
        # Row name -> mean accuracy for the current classifier
        rows = {}

        for placement, vector_data in placement_data.items():
            for vector in vectors:
                # Actual and predicted labels per fold for the requested user
                actual, predict = train_test_k_fold(vector_data[vector][user_name],
                                                    n_splits, model)

                # Take mean accuracy of k fold testing
                fold_accuracies = [accuracy_score(fold_actual, fold_predict)
                                   for fold_actual, fold_predict in zip(actual, predict)]
                rows[placement + ' ' + vector] = np.mean(fold_accuracies)

        # Update accuracy table with classifier column by mapping row names to indices
        accuracy_table[classifier_name] = accuracy_table['Vector'].map(rows)

    return accuracy_table

In [None]:
# Create accuracy table for 5 splits (5-fold cross validation),
# using the function's default user selection
accuracy_table = create_accuracy_table(5)

In [None]:
# Display the accuracy table (rows: placement + vector, columns: classifiers)
accuracy_table

## Part 10 - Combining Feature Selection with Classifiers

### Part (a) - Compare Top Features to Classification Accuracy

In [None]:
def subset_top(top_features, n_top):
    '''Keep only the first n_top features of each placement's ranked list.

    Args:
        top_features: Dictionary mapping placement to a sequence of feature
            names ordered from most to least important.
        n_top: Number of leading features to keep per placement.

    Returns:
        Dictionary mapping placement to a new list holding the first n_top
        feature names, with 'Label' appended when it is not already among
        them. The input sequences are not modified.
    '''
    subset_top_features = {}

    for placement, features in top_features.items():
        # Slice ends are exclusive, so [:n_top] yields exactly n_top names.
        # list() gives an appendable copy whatever sequence type was passed.
        n_features = list(features[:n_top])

        # Add label to ensure it remains with the data
        if 'Label' not in n_features:
            n_features.append('Label')

        subset_top_features[placement] = n_features

    return subset_top_features

In [None]:
def train_n_feats(combined, top_features):
    '''Measure classification accuracy as the number of top features grows.

    Steps through feature counts 5, 10, ... (strictly below the number of
    'Middle' top features), restricts the data to that many top features and
    records the mean k-fold accuracy for each placement.

    NOTE(review): this function uses names that are not defined in the
    visible notebook (top_features_only, model, test_feat_top) and indexes
    the result of train_test_k_fold by placement - confirm which definitions
    it was written against before running it.

    Returns:
        Tuple (n_feats_arr, accuracies): the array of feature counts and a
        dictionary of accuracy lists keyed by placement.
    '''
    n_feats_arr = np.arange(5, len(top_features['Middle']), 5)
    # NOTE(review): only 'Middle' is pre-seeded; any other placement yielded
    # by the loop below would raise KeyError on append.
    accuracies = {'Middle': []}

    # Train and test for each number of top features
    for n_feat in n_feats_arr:
        
        combined_top = top_features_only(combined, subset_top(top_features, n_feat))
        # Get k fold predict and actual labels for each vector
        feature = train_test_k_fold(combined_top, 5, model)
        
        # Compare accuracies vs top features
        for placement in test_feat_top[1].keys():
            # Extract predict and actual
            predict, actual = feature[placement]
            
            # Take mean accuracy of k fold testing
            accuracy_k_fold = []
            
            for i in range(len(predict)):
                accuracy_k_fold.append(accuracy_score(actual[i], predict[i]))
            
            accuracies[placement].append(np.mean(accuracy_k_fold))

    return n_feats_arr, accuracies

## Part 11 - Classification on Single Axes

### Part (a) - Separate Combined Data

In [None]:
def get_matching_columns(combined_data, column_match):
    '''Get a placement dictionary truncated to columns matching a tag.

    For each placement frame in combined_data, keeps every column whose name
    contains column_match (plain substring test), followed by 'Label'.
    '''
    def select_columns(frame):
        # Matching columns in their original order, with the label last
        wanted = [name for name in frame.columns if column_match in name] + ['Label']
        return frame[wanted]

    return {placement: select_columns(frame)
            for placement, frame in combined_data.items()}

In [None]:
def separate_combined(combined_data):
    '''Separate combined data into a dictionary keyed by axes column.

    Each entry of data_columns becomes a key whose value is the placement
    dictionary returned by get_matching_columns for that axes tag.
    '''
    return {axes_column: get_matching_columns(combined_data, axes_column)
            for axes_column in data_columns}

In [None]:
# Split each combined feature set into per-axis placement dictionaries.
# NOTE(review): feat_combined, fft_combined and psd_log_combined are not
# defined in the visible cells - confirm where they are created.
feat_separated = separate_combined(feat_combined)
fft_separated = separate_combined(fft_combined)
psd_log_separated = separate_combined(psd_log_combined)

# Preview the Z Accel columns of the middle placement
feat_separated['Z Accel']['Middle'].head()

### Part (b) - Compute Accuracy Table for Single Axes

In [None]:
def create_axes_accuracy_table(n_splits):
    '''Build a table of mean k-fold accuracy per classifier for single axes.

    Rows are '<vector> <placement> <axis>' combinations; columns are the
    classifiers in the classifiers dictionary. Reads the module-level
    data_columns, placements, vector_names and the *_separated dictionaries.

    Args:
        n_splits: Number of folds passed to train_test_k_fold.

    Returns:
        DataFrame with a 'Vector' column and one accuracy column per classifier.
    '''
    # Dataframe table of accuracies for each classifier for each placement
    vector_indices = [vector + ' ' + placement + ' ' + axis
                      for axis in data_columns
                      for placement in placements
                      for vector in vector_names]
    axes_accuracy_table = pd.DataFrame({'Vector': vector_indices})

    # Separated data in the order the original paired with vector_names
    # (extracted features, FFT, PSD log)
    separated_by_vector = (feat_separated, fft_separated, psd_log_separated)

    # Calculate accuracy for each axes of each placement for each feature vector and classifier
    for classifier_name, model in classifiers.items():
        # Row name -> mean accuracy for the current classifier
        rows = {}

        for axis in data_columns:
            for vector_name, separated in zip(vector_names, separated_by_vector):
                for placement in placements:
                    # train_test_k_fold takes one dataframe and returns
                    # (actual, predicted) lists, so it is called per placement
                    # rather than on the whole placement dictionary.
                    actual, predict = train_test_k_fold(separated[axis][placement],
                                                        n_splits, model)

                    # Take mean accuracy of k fold testing
                    fold_accuracies = [accuracy_score(fold_actual, fold_predict)
                                       for fold_actual, fold_predict in zip(actual, predict)]
                    rows[vector_name + ' ' + placement + ' ' + axis] = np.mean(fold_accuracies)

        # Update accuracy table with classifier column once all rows are known
        axes_accuracy_table[classifier_name] = axes_accuracy_table['Vector'].map(rows)

    return axes_accuracy_table

In [None]:
# Create the single-axis accuracy table for 5 splits (5-fold cross validation)
axes_accuracy_table = create_axes_accuracy_table(5)

In [None]:
# Display the single-axis accuracy table (rows: vector + placement + axis, columns: classifiers)
axes_accuracy_table

### Glossary

`Dataset` - Batch of data recorded on one terrain type

`Data Window` - Split up portion of a `Dataset`

`Direction / Axes` - Linear acceleration or gyroscope in $x,y$ or $z$

`Feature Vector` - Any feature of the data that can be used to classify terrain, e.g. Z Accel Mean, Y Accel FFT, etc

`Extracted Feature Vector` - Features that aren't from transforms, e.g. Z Accel Min, Y Accel Autocorrelation, etc

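`Placement` - One of three IMU placements on the wheelchair, i.e. Middle, Left, or Right. `Synthesis` is treated as a fourth "placement" in this notebook; its columns combine Left and Right wheel data with calculated (`Calc`) values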