Gesture Recognition through Machine Learning
========================
**Machine learning algorithms applied to glove sensor values to predict gestures.**

Consisting of two parts:
1. PCA Visualization
2. RBF SVM vs Random Forest

In [32]:
%pylab inline
#Essentials
import pandas as pd
import numpy as np
import string
import os

#Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split

#Libraries for the classifiers
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

#Utilities
from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem
from sklearn import metrics
from sklearn.grid_search import GridSearchCV

#Model persistence
from sklearn.externals import joblib

#Visualization
import seaborn as sb

from scipy import stats

Populating the interactive namespace from numpy and matplotlib


In [232]:
# Folder layout: everything lives under one base folder
dir_base = "prediction/"

dir_training = "{}data/".format(dir_base)
dir_scaler = "{}scaler/".format(dir_base)
dir_plots = "{}plots/".format(dir_base)
dir_classifiers = "{}classifiers/".format(dir_base)

# Names of the files written inside those folders
consolidated_fn, scaler_fn = "consolidated.csv", "scaler.pkl"

# Cleaning of xlsx data
def data_cleaning(file):
    """Convert one raw ``.xlsx`` recording into a cleaned ``.csv`` next to it.

    The sheet is read with the fixed column names listed below, every row
    that contains a missing value is dropped, and the result is written to
    the same path with the extension replaced by ``.csv`` (no index column).

    Parameters
    ----------
    file : str
        Path of the Excel workbook to clean.

    Returns
    -------
    str
        Path of the CSV file that was written.
    """
    feature_names = ['fThumb', 'fIndex', 'fMiddle', 'fRing', 'fPinky', 'c1', 'c2', 'c3', 'c4', 'aX', 'aY', 'aZ', 'gX', 'gY', 'gZ', 'label']
    # `convert_float=True` was the default and the keyword was later removed
    # from pandas, so it is simply omitted here.
    raw = pd.read_excel(file, names=feature_names)
    cleaned = raw.dropna(how='any')
    # splitext strips only the final extension, so dots elsewhere in the path
    # (e.g. "./data/run.1/a.xlsx") no longer truncate the output name.
    csv_path = os.path.splitext(file)[0] + ".csv"
    cleaned.to_csv(csv_path, index=False)
    return csv_path

# Place label at the end
def rearrange_cols(df):
    """Return ``df`` with its first column moved to the last position.

    All other columns keep their relative order; the input frame itself is
    not modified.
    """
    names = df.columns.tolist()
    leading, remaining = names[:1], names[1:]
    return df[remaining + leading]

# Cleaning and consolidating of training data
def data_consolidate():
    """Clean every raw recording and merge them into one training table.

    Steps:
      1. Each ``.xlsx`` file in ``dir_training`` is converted to a ``.csv``
         via ``data_cleaning``.
      2. Each ``.csv`` is reduced to one row per label (the per-column
         median of that label's rows).
      3. The per-file tables are stacked, the label column is moved to the
         end, rows are sorted by label, and the result is written to
         ``dir_training + consolidated_fn``.

    Returns
    -------
    pandas.DataFrame
        The consolidated table that was written to disk.

    Raises
    ------
    ValueError
        If ``dir_training`` contains no source ``.csv`` files.
    """
    # Sorted so that the row order of the result does not depend on the
    # arbitrary order returned by os.listdir.
    for fn in sorted(os.listdir(dir_training)):
        if fn.endswith('.xlsx'):
            data_cleaning(dir_training + fn)

    # The previous consolidated output lives in the same folder; skipping it
    # keeps a re-run from feeding its own earlier result back into the data.
    csv_files = sorted(
        fn for fn in os.listdir(dir_training)
        if fn.endswith('.csv') and fn != consolidated_fn
    )
    if not csv_files:
        raise ValueError("No source .csv files found in %s" % dir_training)

    per_file = [
        pd.read_csv(dir_training + fn).groupby('label').median().reset_index()
        for fn in csv_files
    ]
    data = pd.concat(per_file, ignore_index=True)
    # mergesort is stable, so rows sharing a label keep their file order
    data = rearrange_cols(data).sort_values(by='label', kind='mergesort')
    data.to_csv(dir_training + consolidated_fn, index=False)
    return data

def get_top_five(df):
    """Return the five labels whose remaining sensor columns vary the most.

    The ``c_cols``, ``a_cols`` and ``g_cols`` columns are dropped; for what
    is left, the standard deviation of every column is computed per label
    and averaged across columns. The full ranking is printed and the five
    labels with the highest average are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``label`` column plus the sensor columns.

    Returns
    -------
    list
        The five most varied labels, most varied first.
    """
    flex_only = df.drop(labels=c_cols + a_cols + g_cols, axis=1)
    # 'label' is kept as the index (no reset_index): the row-wise mean then
    # averages numeric columns only, and each score stays attached to its
    # real label instead of a positional index that had to be mapped back
    # through target_names (wrong whenever a letter is absent from df).
    spread = flex_only.groupby(by="label").std().mean(axis=1).sort_values(ascending=False)
    print(spread)
    return spread.index[:5].tolist()


## Visualization

### Principal Component Analysis (PCA) 
Identifies the combinations of attributes (principal components, or directions in the feature space) that account for the most variance in the data. Here we plot the samples on the first two principal components.

In [185]:
def plot_scatter(X_estimator, letter, title, directory, labels=None):
    """Scatter the first two components, highlighting one class, and save it.

    Points of class ``letter`` are drawn in red on top; every other class is
    drawn in translucent silver underneath. The figure is saved as
    ``<directory><letter>.png``.

    Parameters
    ----------
    X_estimator : array-like
        Transformed samples; columns 0 and 1 are plotted.
    letter : int
        Index into ``target_names`` of the class to highlight.
    title : str
        Accepted for compatibility; the plot title is currently fixed and
        this argument is not used.
    directory : str
        Prefix (folder path ending in a separator) for the saved image.
    labels : array-like, optional
        Class index of every row of ``X_estimator``. When omitted, the
        notebook-level variable ``y`` is used, as before.
    """
    # Fall back to the global `y` so existing four-argument calls still work
    labels = y if labels is None else labels

    plt.figure()
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(12, 8)
    for i in range(len(target_names)):
        px = X_estimator[labels == i, 0]
        py = X_estimator[labels == i, 1]
        # `==` rather than `is`: identity comparison of integers is only
        # accidentally true for small cached ints and fails for numpy ints.
        if i == letter:
            plt.scatter(px, py, c='red', zorder=2)
        else:
            plt.scatter(px, py, c='silver', alpha=0.5, zorder=1)
    # Built once, after all classes are drawn, instead of on every iteration
    plt.legend(target_names)
    plt.title("PCA 2 Components plot for letter %s" % target_names[letter])
    plt.xlabel("First Component")
    plt.ylabel("Second Component")
    plt.savefig("%s%s.png" % (directory, letter))
 

## Classifiers

### SVM (Support Vector Machine)
Finds hyperplanes that separate instances of one class from the rest in an optimal way, choosing those that pass through the widest possible gap between instances of different classes. New instances are classified according to which side of the surface they fall on.

### Random Forest
Based on bagging (bootstrap aggregation), a technique that builds a multitude of randomly trained decision trees for classification. Random subsets are drawn from the original dataset and a decision tree is built from each subset. Once the trees are constructed, the mode of their individual classifications is taken as the forest's prediction.


In [259]:
def classify(X_train, y_train, X_test, y_test, folds, directory):
    """Grid-search an SVM and a random forest, report them, and persist both.

    For each classifier a ``GridSearchCV`` is fitted on the training data,
    the best parameters and every grid score are printed, and a
    classification report on the test data is printed. Each fitted search
    object is then dumped to
    ``<dir_classifiers><directory><EstimatorClassName>.pkl``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training samples and their class indices.
    X_test, y_test : array-like
        Held-out samples and their class indices.
    folds : int or cross-validation generator
        Passed to ``GridSearchCV`` as ``cv``.
    directory : str
        Sub-folder (ending in a separator) under ``dir_classifiers``.

    Returns
    -------
    list
        The fitted ``GridSearchCV`` objects, in the order SVM, random forest.
    """
    # Create the output folder up front: a missing folder would otherwise
    # only surface after the whole (slow) grid search had finished.
    out_dir = dir_classifiers + directory
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    #Machine learning algo variables
    names = [
        "RBF SVM",
        "Random Forest"
         ]
    classifiers = [
        SVC(),
        # Seeded so repeated runs give the same forest (the train/test split
        # made by the caller is seeded as well).
        RandomForestClassifier(random_state=0),
        ]

    parameters = [
        {'kernel': ['rbf','linear'], 'gamma': [1e-3, 1e-4, 2], 'C': [1, 10, 100, 1000]},
        {'max_depth': [3, 7, 9, 11, 25], 'n_estimators': [50, 75, 100, 130, 150]},
    ]

    clf_predictor = []
    for name, clf, parameter in zip(names, classifiers, parameters):
        print("\n################ %s ################" % name)
        grid_search = GridSearchCV(clf, parameter, cv = folds)
        grid_search.fit(X_train, y_train)

        print("Best parameters set found on development set: \n %s" % str(grid_search.best_params_))
        print("Grid scores on development set: \n")
        # Newer scikit-learn exposes `cv_results_`; the legacy GridSearchCV
        # only has `grid_scores_`. Support whichever is available.
        results = getattr(grid_search, "cv_results_", None)
        if results is not None:
            score_rows = zip(results["mean_test_score"], results["std_test_score"], results["params"])
        else:
            score_rows = (
                (mean_score, scores.std(), params)
                for params, mean_score, scores in grid_search.grid_scores_
            )
        for mean_score, score_std, params in score_rows:
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean_score, score_std * 2, params))

        print("The model is trained on the full development set. \n The scores are computed on the full evaluation set. \n")
        y_true, y_pred = y_test, grid_search.predict(X_test)
        print(metrics.classification_report(y_true, y_pred))

        clf_predictor.append(grid_search)

    #Should be in the predictor folder
    for clf in clf_predictor:
        # Class name of the wrapped estimator, e.g. "SVC"
        clf_name = type(clf.estimator).__name__
        path = "%s%s.pkl" % (out_dir, clf_name)
        joblib.dump(clf, path)
        print("%s : dumped!" % clf_name)

    return clf_predictor

# Main

Contains the run/main function

In [257]:
# Data parameters: one class per uppercase letter, in alphabetical order
target_names = list(string.ascii_uppercase)

#file_dir = "sdf/"
def run(df, directory, cv):
    """Train and persist the scaler and classifiers for one feature set.

    The frame is saved as ``<dir_training><directory>data.csv``, split into
    training and test parts, standardized, and handed to ``classify``. The
    fitted scaler is dumped to ``<dir_scaler><directory><scaler_fn>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by a final ``label`` column whose values
        are entries of ``target_names``.
    directory : str
        Sub-folder (ending in a separator) used under each output folder.
    cv : int or cross-validation generator
        Forwarded to ``classify`` as the fold specification.

    Raises
    ------
    ValueError
        If a label is not present in ``target_names``.
    """
    # Make sure the output folders exist before anything is written
    for out_dir in (dir_training + directory, dir_scaler + directory):
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    # index=False matches the other CSVs written by this notebook; with the
    # index included, reloading the file would add a spurious feature column.
    df.to_csv(dir_training + directory + "data.csv", index=False)

    # Data contains the observations, target contains the classifications
    X = df.iloc[:, :-1]
    y = df['label'].apply(lambda x: target_names.index(x)).values

    ############# Classifiers #############
    # Assignment of test and training data. The split happens BEFORE scaling
    # so that test rows never influence the scaler's mean/variance.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

    #Standardize X because of different scaling from different sensors
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    print("***** Data loaded and standardized *****")

    # Saving of scaler
    joblib.dump(scaler, dir_scaler + directory + scaler_fn)
    print("***** Scaler dumped *****")

    ############# PCA (disabled) #############
    # X_std = scaler.transform(X)
    # n = df.columns.size - 1
    # pca = PCA(n_components=n)
    # print("***** Plotting PCA *****")
    #
    # X_pca = pca.fit(X_std).transform(X_std)
    # for i in range(26):
    #     plot_scatter(X_pca, i, "PCA of the dataset", dir_plots + directory)
    #
    # print('Explained variance ratio (first %s components): %s' % (str(n), str(pca.explained_variance_ratio_)))

    print("***** Modeling start *****")

    classify(X_train, y_train, X_test, y_test, cv, directory)
    

In [269]:
# Load the consolidated training table (the file data_consolidate() writes)
df = pd.read_csv("%s%s" % (dir_training, consolidated_fn))
cv = 5 # n of folds

# Sensor column groups. The f* group is what get_top_five keeps as "flex"
# columns; the meaning of the c*/a*/g* groups is not stated in this notebook
# (presumably contact / accelerometer / gyroscope — TODO confirm).
fs_cols = "fThumb fIndex fMiddle fRing fPinky".split()
c_cols = "c1 c2 c3 c4".split()
a_cols = "aX aY aZ".split()
g_cols = "gX gY gZ".split()


# df.drop(labels=c_cols+a_cols+g_cols, axis=1, inplace=True)
# run(df, "flex_only/", cv)

In [300]:
# Reload the persisted artifacts for the "all" feature set.
# NOTE: joblib.load unpickles arbitrary objects; only load files produced by
# this notebook.
scaler = joblib.load(dir_scaler+"all/scaler.pkl")
svm = joblib.load(dir_classifiers+"all/SVC.pkl")

# Spot check: predict the class index for the single row at position 99,
# using every column except the last one as features.
row_features = np.array(df.iloc[99:100,:-1])
svm.predict(scaler.transform(row_features))



array([14])

In [None]:
def evaluate_cross_validation(clf, K, X=None, y=None):
    """Print the K-fold cross-validation scores of ``clf``.

    Parameters
    ----------
    clf : estimator
        Classifier passed to ``cross_val_score``.
    K : int
        Number of folds.
    X, y : array-like, optional
        Samples and targets to evaluate on. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, as before.
    """
    # Fall back to the globals so existing two-argument calls keep working
    X = X_train if X is None else X
    y = y_train if y is None else y

    # create a k-fold cross validation iterator (shuffled, fixed seed)
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # by default the score used is the one returned by score method of the estimator (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print(scores)
    print(("Mean score: {0:.3f} (+/-{1:.3f})").format(np.mean(scores), sem(scores)))
    
def fit_clf(clf, X=None, y=None):
    """Fit ``clf`` and return it.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)``.
    X, y : array-like, optional
        Training samples and targets. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, as before.

    Returns
    -------
    estimator
        The same ``clf`` object, after fitting.
    """
    # Fall back to the globals so existing one-argument calls keep working
    X = X_train if X is None else X
    y = y_train if y is None else y
    clf.fit(X, y)
    return clf

def train_and_evaluate(clf, train_X=None, train_y=None, test_X=None, test_y=None):
    """Fit ``clf``, print its train/test accuracy and test diagnostics.

    Prints the accuracy on the training and test sets, followed by the
    classification report and the confusion matrix for the test set.

    Parameters
    ----------
    clf : estimator
        Classifier exposing ``fit``, ``score`` and ``predict``.
    train_X, train_y, test_X, test_y : array-like, optional
        Data to use. Any argument left out falls back to the notebook-level
        ``X_train`` / ``y_train`` / ``X_test`` / ``y_test``, as before.

    Returns
    -------
    estimator
        The same ``clf`` object, after fitting.
    """
    # Fall back to the globals so existing one-argument calls keep working
    train_X = X_train if train_X is None else train_X
    train_y = y_train if train_y is None else train_y
    test_X = X_test if test_X is None else test_X
    test_y = y_test if test_y is None else test_y

    clf.fit(train_X, train_y)
    print("Accuracy on training set:")
    print(clf.score(train_X, train_y))
    print("Accuracy on testing set:")
    print(clf.score(test_X, test_y))
    y_pred = clf.predict(test_X)
    print("Classification Report:")
    print(metrics.classification_report(test_y, y_pred))
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(test_y, y_pred))
    return clf
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2

# X_KBest = SelectKBest(chi2, k=10).fit_transform(X, y)