# Terrain Classification - Combined User Data
### Created by Keenan McConkey 2019.08.01

In [1]:
from __future__ import absolute_import, division, print_function

import pandas as pd

import glob
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
from mpl_toolkits.mplot3d import Axes3D
from scipy import signal
from scipy import stats

from datetime import datetime
from decimal import Decimal

import pymrmr
import sklearn

## Part 1 - Importing Preprocessed Data

### Part (a) - Functions for Data Import

In [2]:
# All the terrains, placements, vectors, power-assistance, users in the study
terrains = ['Concrete', 'Carpet', 'Linoleum', 'Asphalt', 'Sidewalk', 'Grass', 'Gravel']
powers = ['Manual'] # TODO: Fix power PSD data and add back in
placements_manual = ['Middle', 'Left', 'Right', 'Synthesis']
placements_power = ['Middle']
vectors = ['Features', 'FFTs', 'PSDLogs']
users = ['Keenan', 'Kevin', 'Mahsa', 'Jamie']

### Part (b) - Import Processed Data from Each User

In [3]:
'''Combine data from labelled datasets into a single dataframe'''
def combine_datasets(datasets):
    return pd.concat(list(datasets.values()), ignore_index=True, sort=False)

In [4]:
path = 'processed_data/new_setup/' 

# Nested dictionary of processed data:
# - Power assistance type
# -- Placement
# --- Feature Vector
# ---- User
power_dict = {}

# Create each nesting of the dictionary
for power in powers:
    placement_dict = {}
    
    # Power datasets only have middle placement (for now)
    if power == 'Power':
        placements = placements_power.copy()
    else:
        placements = placements_manual.copy()
    
    for placement in placements:
        vector_dict = {}

        for vector in vectors:
            user_dict = {}

            for user in users:
                # File name based on above parameters
                filename = power.lower() + '/' + placement + '_' + vector + '_Filt_' + user 
                if power == 'Power':
                    filename += '_Power'
                filename += '.csv'
                
                # Read data and update current user dictionary
                data = pd.read_csv(path + filename)
                user_dict.update({user: data})

            # Combine users to form a new entry of user dictionary, save to .csv
            # NaNs arise when you combine Synthesis feature vectors
            combined_data = combine_datasets(user_dict).dropna(axis='columns')
            user_dict.update({'All': combined_data})

            vector_dict.update({vector: user_dict})
        
        # Create a dictionary of the combined feature vector for each user
        combined_vector_user_dict = {}
        
        for user in user_dict.keys():
            # Get all vectors for current user and pop label column
            user_all_vectors = []
            
            for vector in vector_dict.values():
                user_vector = vector[user].copy()
                labels = user_vector.pop('Label') # All label columns should be the same
                user_all_vectors.append(user_vector)
            
            # Combine vectors and add back label column
            combined_vector = pd.concat(user_all_vectors, axis='columns')
            combined_vector.insert(loc=0, column='Label', value=labels)
            combined_vector_user_dict.update({user: combined_vector})
        
        # Add the combined feature vector to the vector dictionary
        vector_dict.update({'Combined': combined_vector_user_dict})
        
        placement_dict.update({placement: vector_dict})
    
    power_dict.update({power: placement_dict})

In [5]:
# Check some data
power_dict['Manual']['Middle']['Combined']['All'].tail()

Unnamed: 0,Label,Mean X Accel Middle,Std Dev X Accel Middle,L2 Norm X Accel Middle,Autocorrelation X Accel Middle,Max X Accel Middle,Min X Accel Middle,Root Mean Squared X Accel Middle,Zero Crossing Rate X Accel Middle,Skew X Accel Middle,...,PSD 1.0 Hz X Gyro Middle,PSD 1.0 Hz Z Accel Middle,PSD 1.0 Hz Y Accel Middle,PSD 1.0 Hz X Accel Middle,PSD 0.0 Hz Z Gyro Middle,PSD 0.0 Hz Y Gyro Middle,PSD 0.0 Hz X Gyro Middle,PSD 0.0 Hz Z Accel Middle,PSD 0.0 Hz Y Accel Middle,PSD 0.0 Hz X Accel Middle
8012,5,-0.048488,-0.146781,-0.224684,-0.29349,-0.369834,0.803903,-0.224684,0.410072,-0.295411,...,-3.199153,-2.119317,-1.360919,-1.72694,-1.125372,-2.414256,-3.104088,-1.992795,-0.906233,-0.163503
8013,5,-0.838211,-0.54981,-0.679571,-0.697007,-0.727631,0.649344,-0.679571,-1.279923,0.891646,...,-2.854475,-1.476822,-1.801745,-0.448959,-1.699782,-2.961492,-2.498364,-1.379586,-1.488251,0.043019
8014,5,-0.05851,-1.523523,-1.527153,-1.358674,-0.612425,0.753509,-1.527153,1.25507,0.353343,...,-2.651156,-2.436379,-1.318068,-1.271694,-1.236619,-2.188757,-2.50294,-1.90818,-1.126076,-0.427966
8015,5,0.068267,1.145593,1.019167,0.982532,2.396203,-0.86399,1.019167,-0.434925,1.624021,...,-3.348061,-2.11386,-2.132868,-0.466117,-1.621375,-2.014946,-3.37316,-2.914507,-1.887899,0.042225
8016,5,-0.180351,-1.198178,-1.241871,-1.149074,-0.756011,1.430402,-1.241871,-0.857424,-0.327254,...,-3.855377,-2.770625,-2.242923,-1.473848,-1.418449,-2.783864,-2.749813,-2.456584,-0.994285,-0.041851


In [6]:
# Check some data
power_dict['Power']['Middle']['FFTs']['All'].tail()

KeyError: 'Power'

## Part 2 - Feature Selection mRMR (minimum Redunancy Maximum Relevance)

Try to find which features are most relevant, from all directions.

Features can be transforms or extracted features.

mRMR tries to find which features have the highest correlation to classified state and lowest correlation with other variables.

### Part (a) - Middle Frame Placement

#### Part (i) - Manual Wheelchair

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Middle']['Features']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Middle']['FFTs']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Middle']['PSDLogs']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Middle']['Combined']['All'], method='MID', nfeats=5)

#### Part (i) - Power Assist Wheelchair

In [None]:
pymrmr.mRMR(data=power_dict['Power']['Middle']['Features']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Power']['Middle']['FFTs']['All'], method='MID', nfeats=5)

In [None]:
#pymrmr.mRMR(data=power_dict['Power']['Middle']['PSDLogs']['All'], method='MID', nfeats=5)

In [None]:
#pymrmr.mRMR(data=power_dict['Power']['Middle']['Combined']['All'], method='MID', nfeats=5)

### Part (b) - Left Wheel Placement

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Left']['Features']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Left']['FFTs']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Left']['PSDLogs']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Left']['Combined']['All'], method='MID', nfeats=5)

### Part (c) - Right Wheel Placement

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Right']['Features']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Right']['FFTs']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Right']['PSDLogs']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Right']['Combined']['All'], method='MID', nfeats=5)

### Part (d) - Synthesis "Placement"

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Synthesis']['Features']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Synthesis']['FFTs']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Synthesis']['PSDLogs']['All'], method='MID', nfeats=5)

In [None]:
pymrmr.mRMR(data=power_dict['Manual']['Synthesis']['Combined']['All'], method='MID', nfeats=5)

## Part 3 - Combining Data from Each Placement

## Part 4 - Training Classifiers

In [None]:
from sklearn.model_selection import KFold

'''Run train test k-fold times
   Returns predicted labels for each K Fold Test'''

def train_test_k_fold(combined_data, n_splits, model):
    kf = KFold(n_splits=n_splits, shuffle=True)

    # Copy data
    data = combined_data.copy()
    
    # Extract terrain labels
    labels = data.pop('Label')

    # Array of predicted labels for each k fold
    predict_k_fold = []
    test_k_fold = []

    # Split into n splits
    for train_index, test_index in kf.split(data):
        train, test = data.loc[train_index], data.loc[test_index]
        train_labels, test_labels = labels.loc[train_index], labels.loc[test_index]

        # Train and test model
        model.fit(train, train_labels)
        predict_k_fold.append(model.predict(test))
        test_k_fold.append(test_labels)

    
    return (test_k_fold, predict_k_fold)

### Part (a) - Create Accuracy Table

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Dictionary of classifiers
classifiers = {'Naive Bayes': GaussianNB(),
               'k Nearest': KNeighborsClassifier(),
               'Decision Tree': DecisionTreeClassifier(), 
               'Random Forest': RandomForestClassifier(n_estimators=100),
               'AdaBoost': AdaBoostClassifier(),
               'Support Vector Machine': SVC(gamma='scale')}

In [None]:
def create_accuracy_table(n_splits, power_type='Manual', user_name='All'):
    # Dataframe table of accuracies for each classifier for each placement
    vector_indices = [placement + ' ' + vector for placement in placements for vector in vectors]
    accuracy_table = pd.DataFrame({'Vector': vector_indices})

    # Calculate accuracy for each placement for each feature vector and classifier
    for classifier_name, classifier in classifiers.items():
        model = classifier

        # Row dictionary for given model
        rows = {}

        # Add current axis and classifier to row dictionary
        for placement in placements:
            for vector in vectors:
                index_name = placement + ' ' + vector

                # Extract predicted and actual labels for requested user
                actual, predict = train_test_k_fold(placement_dict[power_type][placement][vector][user_name], 
                                                    n_splits, model)

                # Take mean accuracy of k fold testing
                accuracies = []
                for i in range(len(predict)):
                    accuracies.append(accuracy_score(actual[i], predict[i]))
                rows.update({index_name: np.mean(accuracies)})

        # Update accuracy table with classifier column by mapping row names to indices
        accuracy_table[classifier_name] = accuracy_table['Vector'].map(rows)
    return accuracy_table

In [None]:
# Create accuracy table for 5 splits
accuracy_table = create_accuracy_table(5)

In [None]:
accuracy_table

## Part 10 - Combining Feature Selection with Classifiers

### Part (a) - Compare Top Features to Classification Accuracy

In [None]:
def subset_top(top_features, n_top):
    subset_top_features = {}
    
    for placement, features in top_features.items():
        n_features = features[0:n_top - 1]
        # Add label to ensure it remains with the data
        if 'Label' not in n_features:
            n_features.append('Label')
        
        subset_top_features.update({placement: n_features})
        
    return subset_top_features

In [None]:
def train_n_feats(combined, top_features):
    n_feats_arr = np.arange(5, len(top_features['Middle']), 5)
    accuracies = {'Middle': []}

    # Train and test for each number of top features
    for n_feat in n_feats_arr:
        
        combined_top = top_features_only(combined, subset_top(top_features, n_feat))
        # Get k fold predict and actual labels for each vector
        feature = train_test_k_fold(combined_top, 5, model)
        
        # Compare accuracies vs top features
        for placement in test_feat_top[1].keys():
            # Extract predict and actual
            predict, actual = feature[placement]
            
            # Take mean accuracy of k fold testing
            accuracy_k_fold = []
            
            for i in range(len(predict)):
                accuracy_k_fold.append(accuracy_score(actual[i], predict[i]))
            
            accuracies[placement].append(np.mean(accuracy_k_fold))

    return n_feats_arr, accuracies

## Part 11 - Classification on Single Axes

### Part (a) - Separate Combined Data

In [None]:
'''Get placement dictionary of combined data truncated to only include columns matching given tag'''
def get_matching_columns(combined_data, column_match):
    # New placement dictionary with only matching columns
    combined_matching_data = {}
    
    for placement, placement_data in combined_data.items():
        # Retrieve column names that match the given match parameter
        matching_columns = [column for column in placement_data.columns if column_match in column]
        matching_columns.append('Label')
        combined_matching_data.update({placement: placement_data[matching_columns]})
        
    return combined_matching_data

In [None]:
'''Separate combined dataframe into dictionary of axes columns'''
def separate_combined(combined_data):
    separated_data = {}
    
    for axes_column in data_columns:
        separated_data.update({axes_column: get_matching_columns(combined_data, axes_column)})
        
    return separated_data

In [None]:
feat_separated = separate_combined(feat_combined)
fft_separated = separate_combined(fft_combined)
psd_log_separated = separate_combined(psd_log_combined)

feat_separated['Z Accel']['Middle'].head()

### Part (b) - Compute Accuracy Table for Single Axes

In [None]:
def create_axes_accuracy_table(n_splits):
    # Dataframe table of accuracies for each classifier for each placement
    vector_indices = [vector + ' ' + placement + ' ' + axis for axis in data_columns for placement in placements for vector in vector_names]
    axes_accuracy_table = pd.DataFrame({'Vector': vector_indices})

    # Calculate accuracy for each axes of each placement for each feature vector and classifier
    for classifier_name, classifier in classifiers.items():
        model = classifier

        # Row dictionary for given model
        rows = {}

        for axis in data_columns:
            # Get k fold predict and actual labels for each vector for current axis
            feat  = train_test_k_fold(feat_separated[axis], n_splits, model)
            fft = train_test_k_fold(fft_separated[axis], n_splits, model)
            psd_log = train_test_k_fold(psd_log_separated[axis], n_splits, model)
            vectors = (feat, fft, psd_log)

            # Add current axis and classifier to row dictionary
            for i, vector_name in enumerate(vector_names):
                for placement in placements:
                    index_name = vector_name + ' ' + placement + ' ' + axis

                    # Extract predicted and actual labels
                    predict, actual = vectors[i][placement]

                    # Take mean accuracy of k fold testing
                    accuracies = []
                    for j in range(len(predict)):
                        accuracies.append(accuracy_score(actual[j], predict[j]))
                    rows.update({index_name: np.mean(accuracies)})

            # Update accuracy table with classifier column by mapping row names to indices
            axes_accuracy_table[classifier_name] = axes_accuracy_table['Vector'].map(rows)
    
    return axes_accuracy_table

In [None]:
axes_accuracy_table = create_axes_accuracy_table(5)

In [None]:
axes_accuracy_table

### Glossary

`Dataset` - Batch of data recorded on one terrain type

`Data Window` - Split up portion of a `Dataset`

`Direction / Axes` - Linear acceleration or gyroscope in $x,y$ or $z$

`Feature Vector` - Any feature of the data that can be used to classify terrain, e.g. Z Accel Mean, Y Accel FFT, etc

`Extracted Feature Vector` - Features that aren't from transforms, e.g. Z Accel Min, Y Accel Autocorrelation, etc

`Placement` - One of three IMU placements on the wheelchair, i.e. Middle, Left, or Right