In [40]:
import numpy as np 
import pandas as pd 
from scipy import signal
import matplotlib.pyplot as plt 
import math
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [41]:
def data_loading():
    #load data from 19 participants
     for i in range(19):
        df = pd.read_csv('dataset/dataset_' + str(i + 1) + '.txt', sep=',', header=None)

In [42]:
def data_visulization():
    # read dataset file
    for i in range(1,14):
        df_activities = df[df[24] == i].values
        plt.plot(df_activities[:5000,18:21])
        plt.plot(df_activities[:5000,22:25])
        plt.show()

In [43]:
def noise_removing():
    #Butterworth Low-pass
    b, a = signal.butter(4, 0.04, 'low', analog = False)
    for c in range(1, 14):
        activity_data = df[df[24] == c].values
        for j in range(24):
            activity_data[:, j] = signal.lfilter(b, a, activity_data[:, j])

In [44]:
def feature_engineering():
    training = np.empty(shape=(0, 22))
    testing = np.empty(shape=(0, 22))
    # deal with each dataset file
    for i in range(19):
        df = pd.read_csv('dataset/dataset_' + str(i + 1) + '.txt', sep=',', header=None)
        print('deal with dataset ' + str(i + 1))
        for c in range(1, 14):
            activity_data = df[df[24] == c].values
            b, a = signal.butter(4, 0.04, 'low', analog=False)
            for j in range(24):
                activity_data[:, j] = signal.lfilter(b, a, activity_data[:, j])
            
            datat_len = len(activity_data)
            training_len = math.floor(datat_len * 0.8)
            training_data = activity_data[:training_len, :]
            testing_data = activity_data[training_len:, :]

            training_sample_number = training_len // 1000 + 1
            testing_sample_number = (datat_len - training_len) // 1000 + 1

            for s in range(training_sample_number):
                if s < training_sample_number - 1:
                    sample_data = training_data[1000*s:1000*(s + 1), :]
                else:
                    sample_data = training_data[1000*s:, :]
                feature_sample = []
                for i in range(18,25):
                    feature_sample.append(np.min(sample_data[:, i]))
                    feature_sample.append(np.max(sample_data[:, i]))
                    feature_sample.append(np.mean(sample_data[:, i]))
                feature_sample.append(sample_data[0, -1])
                feature_sample = np.array([feature_sample])
                training = np.concatenate((training, feature_sample), axis=0)
            
            for s in range(testing_sample_number):
                if s < training_sample_number - 1:
                    sample_data = testing_data[1000*s:1000*(s + 1), :]
                else:
                    sample_data = testing_data[1000*s:, :]

                feature_sample = []
                for i in range(18,25):
                    feature_sample.append(np.min(sample_data[:, i]))
                    feature_sample.append(np.max(sample_data[:, i]))
                    feature_sample.append(np.mean(sample_data[:, i]))
                feature_sample.append(sample_data[0, -1])
                feature_sample = np.array([feature_sample])
                testing = np.concatenate((testing, feature_sample), axis=0)

    df_training = pd.DataFrame(training)
    df_testing = pd.DataFrame(testing)
    df_training.to_csv('training_data.csv', index=None, header=None)
    df_testing.to_csv('testing_data.csv', index=None, header=None)


In [45]:
def model_training_and_evaluation_example():
    df_training = pd.read_csv('training_data.csv', header=None)
    df_testing = pd.read_csv('testing_data.csv', header=None)

    y_train = df_training[19].values
    y_train = y_train - 1
    df_training = df_training.drop([19], axis=1)
    X_train = df_training.values

    y_test = df_testing[19].values
    y_test = y_test - 1
    df_testing = df_testing.drop([19], axis=1)
    X_test = df_testing.values
    # StandardScaler for data normalization
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Build KNN classifier
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    # confusion matrix
    print(confusion_matrix(y_test, y_pred))
    #SVM classifier
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-1,1e-2, 1e-3, 1e-4],
                     'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 100]},
                    {'kernel': ['linear'], 'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]}]
    acc_scorer = make_scorer(accuracy_score)
    grid_obj  = GridSearchCV(SVC(), tuned_parameters, cv=10, scoring=acc_scorer)
    grid_obj  = grid_obj .fit(X_train, y_train)
    clf = grid_obj.best_estimator_
    print('best clf:', clf)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

In [None]:
if __name__ == '__main__':
   # feature_engineering()
    model_training_and_evaluation_example()

Accuracy:  0.9328214971209213
[[ 57   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  57   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0  57   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0  96   2   1   0   0   0   0   0   0   0]
 [  0   0   1   5  30  21   0   0   0   0   0   0   0]
 [  0   0   0   2  16  65   0   2   0   0   0   0   0]
 [  0   0   0   0   1   7 211   0   0   0   0   0   0]
 [  0   0   0   0   0   1   1  37   0   0   0   0   0]
 [  0   0   0   0   0   1   1   0  36   0   0   0   0]
 [  0   0   0   0   0   0   1   0   0  95   0   0   0]
 [  0   0   0   0   0   0   1   0   1   0  98   0   0]
 [  0   0   0   0   0   0   2   0   0   0   0  98   0]
 [  0   0   0   0   0   0   2   0   0   0   0   1  35]]
