# Training Notebook
Dieses Notebook enthält Code zur Modellselektion und zum Training für die Übungsaufgabe der Vorlesung „Kontextsensitive Systeme“.

In [10]:
from load_data import load_data_from_influxdb, create_data_windows, split_X_y

import pandas as pd
import numpy as np

from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold, LeaveOneGroupOut, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn_porter import Porter

Sensordaten aus InfluxDB lesen und in pandas dataframe speichern

In [11]:
# Fetch the recorded sensor samples through the project helper imported from
# load_data (per the notebook text, the data comes from InfluxDB).
sensor_data = load_data_from_influxdb()
# Last expression of the cell: renders the first rows as a quick sanity check.
sensor_data.head()

Unnamed: 0_level_0,accel_magnitude,accelerationIncludingGravityX,accelerationIncludingGravityY,accelerationIncludingGravityZ,accelerationX,accelerationY,accelerationZ,alpha,beta,context,gamma,rotationRateAlpha,rotationRateBeta,rotationRateGamma,subject
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-10-11 07:21:55.310,2.137617,6.727186,-3.571412,-1.611477,1.592222,-0.855081,0.87485,166.498367,-141.032441,Laufen,59.32855,-2.721634,-122.218491,137.415077,Sub1
2019-10-11 07:21:55.408,7.204752,13.063213,-8.336831,0.455985,6.11937,-2.398366,2.310062,182.140161,-140.776227,Laufen,75.212277,-94.826815,-66.347152,223.238121,Sub1
2019-10-11 07:21:55.514,13.743447,20.764571,-11.685181,1.381943,12.578234,-1.701651,4.626536,166.315054,-131.606657,Laufen,68.579714,-197.857991,-25.504385,259.23368,Sub1
2019-10-11 07:21:55.609,15.249874,19.491302,-8.650409,5.045149,11.269765,2.904983,9.645309,140.867588,-129.190825,Laufen,60.759184,-242.591754,102.758997,201.188079,Sub1
2019-10-11 07:21:55.709,14.255147,11.730299,-5.042175,8.092641,3.360819,5.594839,12.474462,121.069633,-131.611198,Laufen,62.400513,-157.908493,62.966491,96.534442,Sub1


Daten in Fenster von 2 Sekunden aufteilen und einfache Features berechnen (min, max, mean, std)

In [12]:
# Length of one feature window (2000 ms = 2 s), kept in one place so it is
# easy to tune. Presumably parsed as a time offset by create_data_windows.
WINDOW_SIZE = '2000ms'

# Cut the raw samples into windows. According to the notebook text, each window
# is summarised by min, max, mean and std features.
data_windows = create_data_windows(sensor_data, WINDOW_SIZE)

# Only the LAST expression of a cell is rendered automatically. As a bare
# statement in the middle of the cell the preview was computed and silently
# discarded, so it is shown explicitly here. display() is provided by the
# IPython/Jupyter kernel.
display(data_windows.head())
data_windows.shape

(494, 54)

In [13]:
# Windows of these subjects form the held-out test set; the windows of all
# remaining subjects are used for training and model selection.
held_out_subjects = ["Sub1", "Sub2"]
is_held_out = data_windows["subject"].isin(held_out_subjects)

data_windows_test = data_windows[is_held_out]
data_windows_train = data_windows[~is_held_out]

Funktion zur Modellevaluation definieren

In [14]:
def print_confusion_matrix(matrix, labels):
    """Print a confusion matrix as a fixed-width text table.

    The header line consists of a 7-character blank corner followed by every
    label centred in a 10-character column. Each following line starts with a
    label right-aligned to 7 characters, followed by all entries of the
    matching matrix row, each centred in 10 characters.

    Args:
        matrix: 2-D array that supports ``matrix[i, j]`` indexing and exposes
            ``shape``; row ``i`` is printed next to ``labels[i]``.
        labels: Sequence of captions used for both the header and the rows.
    """
    header_cells = ["{:^10}".format(caption) for caption in labels]
    print(" " * 7 + "".join(header_cells))

    for row, caption in enumerate(labels):
        row_cells = [
            "{:^10}".format(matrix[row, col])
            for col in range(matrix.shape[1])
        ]
        print("{:>7}".format(caption) + "".join(row_cells))

def evaluate_classifier(X, y, split_indices, classifier, classifier_name,
                        class_labels=(0, 1, 2),
                        class_names=("Sitzen", "Laufen", "Fahrrad")):
    """Cross-validate a classifier and print accuracy and a confusion matrix.

    For every ``(train_index, test_index)`` pair the classifier is fitted on
    the training part and scored on the test part. The per-fold accuracies are
    summarised by mean and standard deviation, and the per-fold confusion
    matrices are added up into a single matrix that is printed at the end.

    Note: ``classifier`` is fitted in place, so after the call it holds the
    model of the last fold.

    Args:
        X: Feature matrix, indexable with an index array (``X[train_index]``).
        y: Label vector, indexable the same way as ``X``.
        split_indices: Iterable of ``(train_index, test_index)`` pairs, e.g.
            the result of a scikit-learn splitter's ``split()``.
        classifier: Estimator offering ``fit(X, y)`` and ``predict(X)``.
        classifier_name: Name shown in the printed report.
        class_labels: Encoded label values that make up the rows/columns of
            the confusion matrix, in display order.
        class_names: Captions for ``class_labels``, in the same order.
            NOTE(review): the defaults assume the encoding 0=Sitzen,
            1=Laufen, 2=Fahrrad is what ``split_X_y`` produces - confirm
            (an alphabetical label encoding would order them differently).

    Raises:
        ValueError: If ``class_labels`` and ``class_names`` differ in length.
    """
    class_labels = list(class_labels)
    class_names = list(class_names)
    if len(class_labels) != len(class_names):
        raise ValueError(
            "class_labels and class_names must have the same length "
            "(got {} and {})".format(len(class_labels), len(class_names))
        )

    accuracy_scores = []
    # Entries are sample counts, so accumulate them as integers; a float
    # matrix would print every count with a trailing ".0".
    n_classes = len(class_labels)
    confusion_matrix_sum = np.zeros((n_classes, n_classes), dtype=int)

    for train_index, test_index in split_indices:
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        # Passing the labels explicitly keeps the matrix shape fixed even when
        # a fold does not contain every class.
        confusion_matrix_sum += confusion_matrix(y_test, y_pred, labels=class_labels)

    print("Results for {} Classifier:".format(classifier_name))
    print("Accuracy mean:", np.mean(accuracy_scores))
    print("Accuracy std:", np.std(accuracy_scores))
    print("Confusion Matrix:")
    print_confusion_matrix(confusion_matrix_sum, class_names)
    print()
    print()

Daten in Merkmale X und Label y aufteilen

In [15]:
# Separate each partition into its features (X) and labels (y) using the
# project helper from load_data.
# NOTE(review): presumably X and y are NumPy arrays, since evaluate_classifier
# indexes them with index arrays - confirm against split_X_y.
X_train, y_train = split_X_y(data_windows_train)
X_test, y_test = split_X_y(data_windows_test)

4 verschiedene Klassifikationsalgorithmen ausprobieren

In [16]:
# Candidate models as (display name, estimator) pairs. Every estimator that
# accepts a seed gets a fixed one so that re-running the notebook reproduces
# the same results.
classifiers = [
    ("KNN", KNeighborsClassifier(n_neighbors=10)),
    ("SVM", SVC(C=0.025, kernel="linear", random_state=1)),
    ("Decision Tree", DecisionTreeClassifier(random_state=1, max_depth=10)),
    ("Random Forest", RandomForestClassifier(n_estimators=10, max_depth=5, random_state=1)),
]

Evaluation mit 10-Fold cross validation

In [17]:
# One seeded, shuffled 10-fold splitter for all models: with an integer
# random_state every split() call yields the same folds, so each classifier is
# scored on identical partitions.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
for model_name, model in classifiers:
    folds = k_fold.split(X_train)
    evaluate_classifier(X_train, y_train, folds, model, model_name)

Results for KNN Classifier:
Accuracy mean: 0.9625177809388334
Accuracy std: 0.024467088717289844
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen  102.0      10.0      0.0    
 Laufen   1.0      117.0      0.0    
Fahrrad   2.0       1.0      142.0   


Results for SVM Classifier:
Accuracy mean: 0.9678520625889048
Accuracy std: 0.020116332952945713
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen  108.0      2.0       2.0    
 Laufen   1.0      115.0      2.0    
Fahrrad   3.0       2.0      140.0   


Results for Decision Tree Classifier:
Accuracy mean: 0.9866998577524895
Accuracy std: 0.013302424097392618
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen  111.0      1.0       0.0    
 Laufen   1.0      116.0      1.0    
Fahrrad   1.0       1.0      143.0   


Results for Random Forest Classifier:
Accuracy mean: 0.9920341394025606
Accuracy std: 0.012169438732282534
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen  112.0      

Evaluation mit leave-one-subject-out cross validation

In [18]:
# Subject of every training window; each fold holds out all windows of exactly
# one subject, so the score reflects performance on an unseen person.
subject_groups = data_windows_train["subject"].to_numpy()
leave_one_subject_out = LeaveOneGroupOut()
for model_name, model in classifiers:
    subject_folds = leave_one_subject_out.split(X_train, groups=subject_groups)
    evaluate_classifier(X_train, y_train, subject_folds, model, model_name)

Results for KNN Classifier:
Accuracy mean: 0.9420374334827213
Accuracy std: 0.0352982529053315
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen  102.0      10.0      0.0    
 Laufen   8.0      110.0      0.0    
Fahrrad   2.0       1.0      142.0   


Results for SVM Classifier:
Accuracy mean: 0.8205363601189029
Accuracy std: 0.1486634631793104
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen  105.0      2.0       5.0    
 Laufen   14.0      77.0      27.0   
Fahrrad   7.0       11.0     127.0   


Results for Decision Tree Classifier:
Accuracy mean: 0.9834643174428122
Accuracy std: 0.009440025640068045
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen  111.0      1.0       0.0    
 Laufen   1.0      115.0      2.0    
Fahrrad   1.0       1.0      143.0   


Results for Random Forest Classifier:
Accuracy mean: 0.9911965811965812
Accuracy std: 0.01471600292884478
Confusion Matrix:
         Sitzen    Laufen   Fahrrad  
 Sitzen  112.0      0.0  

Ergebnis: Random Forest erzielt die besten Ergebnisse.  
Daher jetzt Suche nach den besten Hyperparametern.

In [19]:
# Exhaustive search over tree depth and forest size for the random forest.
classifier = RandomForestClassifier(random_state=1)
parameters = {
    "max_depth": range(1, 11),
    "n_estimators": range(2, 21),
}
# NOTE(review): cv=5 does not split by subject, so windows of one subject can
# end up in both the training and the validation part of a fold; consider a
# group-aware splitter such as LeaveOneGroupOut here as well.
grid_search_options = {"cv": 5, "verbose": 1, "n_jobs": 4}
try:
    # Older scikit-learn releases weight fold scores by fold size unless
    # iid=False is passed explicitly.
    gs = GridSearchCV(classifier, parameters, iid=False, **grid_search_options)
except TypeError:
    # The `iid` parameter was removed in scikit-learn 0.24 and passing it
    # raises TypeError; the remaining behaviour is the one iid=False selected.
    gs = GridSearchCV(classifier, parameters, **grid_search_options)
gs.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on all of X_train.
best_random_forest = gs.best_estimator_
print("Best Random Forest Model:")
print("max_depth: {}".format(gs.best_params_["max_depth"]))
print("n_estimators: {}".format(gs.best_params_["n_estimators"]))
print("Train Accuracy: {}".format(best_random_forest.score(X_train, y_train)))
# Sub1/Sub2 were never used during model selection, so this is the held-out score.
print("Test Accuracy: {}".format(best_random_forest.score(X_test, y_test)))

Fitting 5 folds for each of 190 candidates, totalling 950 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 427 tasks      | elapsed:    2.5s


Best Random Forest Model:
max_depth: 3
n_estimators: 5
Train Accuracy: 0.9946666666666667
Test Accuracy: 1.0


[Parallel(n_jobs=4)]: Done 950 out of 950 | elapsed:    4.8s finished


Recursive Feature Elimination ausprobieren, um Dimensionalität der Eingabe zu reduzieren

In [20]:
# NOTE(review): these hyper-parameters are hard-coded and differ from the
# grid-search result shown in the saved output (max_depth=3, n_estimators=5) -
# confirm whether RFE should rather run with the best configuration.
classifier = RandomForestClassifier(max_depth=2, n_estimators=13, random_state=1)

# Remove one feature per iteration until 10 remain. n_features_to_select is
# passed by keyword because newer scikit-learn releases no longer accept it
# positionally.
rfe = RFE(classifier, n_features_to_select=10, step=1)
rfe.fit(X_train, y_train)

# Map the boolean selection mask back to feature names.
# Assumes split_X_y drops exactly "context" and "subject" and keeps the column
# order - the assertion below at least catches a differing column count.
column_names = data_windows.drop(["context", "subject"], axis=1).columns
assert len(column_names) == len(rfe.support_), (
    "feature names and RFE support mask differ in length"
)
columns = [column_name for support, column_name in zip(rfe.support_, column_names) if support]
columns

['accel_magnitude_max',
 'accel_magnitude_mean',
 'accelerationIncludingGravityY_min',
 'accelerationIncludingGravityY_std',
 'accelerationX_std',
 'accelerationY_min',
 'accelerationY_std',
 'accelerationZ_min',
 'accelerationZ_mean',
 'accelerationZ_std']

Bestes Modell als JavaScript exportieren

In [21]:
# Transpile the tuned random forest to JavaScript with sklearn-porter.
porter = Porter(best_random_forest, language='js')
export = porter.export(embed_data=True)

# The context manager closes the file even if writing fails; with the manual
# open()/close() pair the handle stayed open when write() raised.
with open("random_forest.js", "w") as f:
    f.write(export)