# Training Notebook
Dieses Notebook enthält Code zur Modellselektion und zum Training für die Übungsaufgabe der Vorlesung „Kontextsensitive Systeme“.

In [16]:
from load_data import load_data_from_influxdb, create_data_windows, split_X_y

import pandas as pd
import numpy as np

from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold, LeaveOneGroupOut, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn_porter import Porter

Sensordaten aus InfluxDB lesen und in einem pandas-DataFrame speichern

In [17]:
# Fetch the sensor readings through the project helper imported from load_data.
sensor_data = load_data_from_influxdb()
# Preview the first rows to sanity-check the loaded columns.
sensor_data.head()

Unnamed: 0_level_0,accel_magnitude,accelerationIncludingGravityX,accelerationIncludingGravityY,accelerationIncludingGravityZ,accelerationX,accelerationY,accelerationZ,alpha,beta,context,gamma,rotationRateAlpha,rotationRateBeta,rotationRateGamma,subject
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-10-08 05:30:16.062,0.174358,6.022812,-2.550041,5.568802,0.087672,0.063873,-0.100828,92.962407,-22.961549,Laufen,-82.587576,-4.72365,-0.93284,44.887348,Sub1
2019-10-08 05:30:16.162,2.292669,11.118669,-3.663746,-0.70939,0.537493,1.099085,-1.540338,121.915243,-46.603486,Laufen,-56.643401,4.472299,-5.210323,87.627502,Sub1
2019-10-08 05:30:16.263,5.734371,12.82893,-2.489791,-0.180875,1.584237,1.562521,1.260424,263.367118,-160.659473,Laufen,82.775591,-116.616136,39.370595,36.103688,Sub1
2019-10-08 05:30:16.363,3.608428,14.043985,-3.525084,3.092599,1.876998,-0.17404,2.87355,110.310284,-39.489416,Laufen,-58.643106,-5.008587,-2.402769,-33.415544,Sub1
2019-10-08 05:30:16.464,5.535369,16.649476,-6.796758,1.507437,3.448174,-4.156194,0.941564,85.736742,-10.820387,Laufen,-87.982199,73.23347,-36.368736,-89.517006,Sub1


Daten in Fenster von 2 Sekunden aufteilen und einfache Features berechnen (min, max, mean, std)

In [18]:
# Build the windowed feature table; '2000ms' is the window length handed to the helper.
data_windows = create_data_windows(sensor_data, '2000ms')
# NOTE(review): this preview is never rendered, because a notebook cell only
# displays the value of its last expression (the shape below).
data_windows.head()
# (rows, columns) of the windowed table.
data_windows.shape

(126, 54)

In [19]:
# Subject-wise hold-out: every window recorded by Sub1 or Sub2 goes to the
# test set, windows of all remaining subjects form the training set.
held_out_mask = data_windows["subject"].isin(["Sub1", "Sub2"])
data_windows_test = data_windows[held_out_mask]
data_windows_train = data_windows[~held_out_mask]

Funktion zur Modellevaluation definieren

In [20]:
def print_confusion_matrix(matrix, labels):
    """Print a confusion matrix as a fixed-width text table.

    The first printed line holds the column captions. Every following line
    starts with a row caption and then lists the cell values of that row.

    Args:
        matrix: 2-D array indexed as ``matrix[row, column]`` that exposes
            ``shape``. One row is printed per entry in ``labels``, and every
            column of the matrix is printed.
        labels: Captions used for the column header and for the rows.
    """
    centered = "{:^10}".format
    right_aligned = "{:>7}".format

    # Header: blank corner as wide as a row caption, then the column captions.
    print(" " * 7 + "".join(centered(caption) for caption in labels))

    for row, caption in enumerate(labels):
        cells = "".join(
            centered(matrix[row, column]) for column in range(matrix.shape[1])
        )
        print(right_aligned(caption) + cells)

def evaluate_classifier(X, y, split_indices, classifier, classifier_name,
                        class_labels=(0, 1, 2),
                        class_names=("Sitzen", "Laufen", "Fahrrad")):
    """Cross-validate a classifier and print accuracy plus a summed confusion matrix.

    For every (train, test) index pair the classifier is fitted on the train
    part and scored on the test part. The per-fold accuracies are summarised
    by mean and standard deviation; the per-fold confusion matrices are added
    up into a single matrix that is printed at the end.

    Args:
        X: Feature array; must support indexing with an index array
            (``X[train_index]``).
        y: Label array, indexed the same way as ``X``.
        split_indices: Iterable of ``(train_index, test_index)`` pairs.
        classifier: Estimator offering ``fit`` and ``predict``. It is refitted
            in place on every fold, so after the call it holds the fit of the
            last fold.
        classifier_name: Name shown in the printed report.
        class_labels: Encoded label values, in the row/column order of the
            confusion matrix. Defaults to ``(0, 1, 2)``.
        class_names: Display captions, one per entry of ``class_labels`` and
            in the same order.
            NOTE(review): the default assumes the labels are encoded as
            0=Sitzen, 1=Laufen, 2=Fahrrad - confirm against split_X_y.

    Raises:
        ValueError: If ``class_labels`` and ``class_names`` differ in length.
    """
    class_labels = list(class_labels)
    class_names = list(class_names)
    if len(class_labels) != len(class_names):
        raise ValueError("class_labels and class_names must have the same length")

    n_classes = len(class_labels)
    accuracy_scores = []
    # Float accumulator (np.zeros default), so counts print as e.g. "58.0".
    confusion_matrix_sum = np.zeros((n_classes, n_classes))

    for train_index, test_index in split_indices:
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        # Passing labels explicitly keeps the matrix shape fixed even when a
        # fold does not contain every class.
        confusion_matrix_sum += confusion_matrix(y_test, y_pred, labels=class_labels)

    print("Results for {} Classifier:".format(classifier_name))
    print("Accuracy mean:", np.mean(accuracy_scores))
    print("Accuracy std:", np.std(accuracy_scores))
    print("Confusion Matrix:")
    print_confusion_matrix(confusion_matrix_sum, class_names)
    print()
    print()

Daten in Merkmale X und Label y aufteilen

In [21]:
# Separate each subset into features (X) and labels (y) with the project helper.
X_train, y_train = split_X_y(data_windows_train)
X_test, y_test = split_X_y(data_windows_test)

4 verschiedene Klassifikationsalgorithmen ausprobieren

In [22]:
# Candidate models as (display name, estimator) pairs. The display name is
# used in the printed evaluation report. Estimators that accept a seed are
# created with random_state=0. The positional 10 given to
# KNeighborsClassifier is its n_neighbors argument.
classifiers = [
    ("KNN", KNeighborsClassifier(10)),
    ("SVM", SVC(kernel="linear", C=0.025, random_state=0)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=10, random_state=0)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, random_state=0))
]

Evaluation mit 10-Fold cross validation

In [23]:
# A single seeded, shuffled 10-fold splitter serves all models: with an integer
# random_state every split() call yields the same folds, so each classifier is
# evaluated on identical partitions.
ten_fold = KFold(10, shuffle=True, random_state=0)
for model_name, model in classifiers:
    evaluate_classifier(X_train, y_train, ten_fold.split(X_train), model, model_name)

Results for KNN Classifier:
Accuracy mean: 0.9291666666666666
Accuracy std: 0.07643937725453896
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen   58.0      1.0       0.0    
 Laufen   2.0       23.0      0.0    
Fahrrad   0.0       3.0       0.0    


Results for SVM Classifier:
Accuracy mean: 0.9888888888888889
Accuracy std: 0.03333333333333335
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen   58.0      1.0       0.0    
 Laufen   0.0       25.0      0.0    
Fahrrad   0.0       0.0       3.0    


Results for Decision Tree Classifier:
Accuracy mean: 0.976388888888889
Accuracy std: 0.047324236215002285
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen   59.0      0.0       0.0    
 Laufen   1.0       24.0      0.0    
Fahrrad   0.0       1.0       2.0    


Results for Random Forest Classifier:
Accuracy mean: 0.976388888888889
Accuracy std: 0.047324236215002285
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen   59.0      0.0 

Evaluation mit leave-one-subject-out cross validation

In [24]:
# Group label per training window: the subject that recorded it. Each fold
# then holds out all windows of exactly one subject.
subject_per_window = data_windows_train["subject"].to_numpy()
subject_splitter = LeaveOneGroupOut()
for model_name, model in classifiers:
    folds = subject_splitter.split(X_train, groups=subject_per_window)
    evaluate_classifier(X_train, y_train, folds, model, model_name)

Results for KNN Classifier:
Accuracy mean: 0.9105555555555556
Accuracy std: 0.060254091608417396
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen   57.0      2.0       0.0    
 Laufen   2.0       23.0      0.0    
Fahrrad   0.0       3.0       0.0    


Results for SVM Classifier:
Accuracy mean: 0.9444444444444444
Accuracy std: 0.06804138174397716
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen   58.0      1.0       0.0    
 Laufen   0.0       25.0      0.0    
Fahrrad   0.0       3.0       0.0    


Results for Decision Tree Classifier:
Accuracy mean: 0.9305555555555556
Accuracy std: 0.12028130608117203
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen   59.0      0.0       0.0    
 Laufen   2.0       23.0      0.0    
Fahrrad   0.0       3.0       0.0    


Results for Random Forest Classifier:
Accuracy mean: 0.9583333333333334
Accuracy std: 0.0721687836487032
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen   59.0      0.0 

Ergebnis: Random Forest erzielt die besten Ergebnisse.  
Daher folgt jetzt die Suche nach den besten Hyperparametern.

In [10]:
# Exhaustive grid search for the random forest: tree depth 1..10 combined with
# 2..20 trees, each candidate scored by 5-fold cross validation on the
# training data using 4 parallel workers.
# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24;
# it is kept here because dropping it can change model selection on older
# releases - confirm the pinned scikit-learn version before removing it.
classifier = RandomForestClassifier(random_state=0)
parameters = {"max_depth": range(1, 11), "n_estimators": range(2, 21)}
gs = GridSearchCV(
    classifier,
    parameters,
    cv=5,
    iid=False,
    verbose=1,
    n_jobs=4,
)
gs.fit(X_train, y_train)

# best_estimator_ is the winning configuration, refitted by the search.
best_random_forest = gs.best_estimator_
best_settings = best_random_forest.get_params()

print("Best Random Forest Model:")
print("max_depth: {}".format(best_settings["max_depth"]))
print("n_estimators: {}".format(best_settings["n_estimators"]))
train_accuracy = best_random_forest.score(X_train, y_train)
print("Train Accuracy: {}".format(train_accuracy))
test_accuracy = best_random_forest.score(X_test, y_test)
print("Test Accuracy: {}".format(test_accuracy))

Fitting 5 folds for each of 190 candidates, totalling 950 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 401 tasks      | elapsed:    2.3s


Best Random Forest Model:
max_depth: 2
n_estimators: 5
Train Accuracy: 1.0
Test Accuracy: 0.9743589743589743


[Parallel(n_jobs=4)]: Done 950 out of 950 | elapsed:    4.3s finished


Recursive Feature Elimination ausprobieren, um Dimensionalität der Eingabe zu reduzieren

In [11]:
# Recursive feature elimination: repeatedly fit the forest and drop one
# feature per round (step=1) until 10 features remain.
# NOTE(review): n_estimators=13 differs from the n_estimators=5 printed by the
# grid search above - confirm which value is intended.
classifier = RandomForestClassifier(max_depth=2, n_estimators=13, random_state=0)
# n_features_to_select is passed by keyword: scikit-learn 1.0+ no longer
# accepts it positionally, and the keyword form works on older releases too.
rfe = RFE(classifier, n_features_to_select=10, step=1)
rfe.fit(X_train, y_train)
# Map the boolean support mask back to column names.
# presumably X_train keeps the column order of data_windows without
# "context"/"subject"; verify against split_X_y.
column_names = data_windows.drop(["context", "subject"], axis=1).columns
columns = [column_name for support, column_name in zip(rfe.support_, column_names) if support]
columns

['accelerationX_std',
 'accelerationY_min',
 'accelerationY_mean',
 'accelerationZ_min',
 'rotationRateAlpha_min',
 'rotationRateAlpha_mean',
 'rotationRateAlpha_std',
 'rotationRateBeta_max',
 'rotationRateGamma_max',
 'rotationRateGamma_std']

Bestes Modell als JavaScript exportieren

In [12]:
# Transpile the tuned random forest to JavaScript source via sklearn-porter.
porter = Porter(best_random_forest, language='js')
export = porter.export(embed_data=True)
# The context manager closes the file even if the write raises, so the handle
# cannot leak as it could with a manual open()/close() pair.
with open("random_forest.js", "w") as f:
    f.write(export)