In [15]:
import numpy as np 
import pandas as pd 
from scipy import signal
import matplotlib.pyplot as plt 
import math
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [2]:
 # Load dataset 1 (comma-separated, no header row) into the notebook-level frame
 # `df`, which data_visulization() and noise_removing() read as a global.
 df = pd.read_csv('dataset/dataset_1.txt', header=None, sep=',')

In [3]:
def data_visulization():
    """Draw one figure per activity from the notebook-level ``df``.

    For every activity label 1..13 (column 24 of ``df``) the first 5000 rows
    of that activity are plotted twice on the same axes: columns 18-20 and
    columns 22-24. Each figure is titled with the activity name and shown
    with ``plt.show()``.

    Reads the global ``df``; takes no arguments and returns nothing.
    """
    # Names follow the order of the activity labels 1..13.
    activity_names = (
        'sitting', 'lying', 'standing', 'washingdish', 'vacuum', 'sweep',
        'walking', 'ascenStair', 'decenStair', 'treadmill',
        'bicycling50', 'bicycling100', 'ropejump',
    )
    for label, name in enumerate(activity_names, start=1):
        activity_data = df[df[24] == label].values
        # data from accelerometer axis of ankle device
        plt.plot(activity_data[:5000, 18:21])
        # data from gyroscope axis of ankle device
        # NOTE(review): the slice 22:25 includes column 24, which is the
        # activity label used for the selection above, and skips column 21.
        # If the gyroscope axes are really columns 21-23 this should be
        # 21:24 -- confirm against the dataset description.
        plt.plot(activity_data[:5000, 22:25])
        plt.title(name)
        # Every figure is shown explicitly, including the last one.
        plt.show()

In [4]:
def noise_removing():
    """Low-pass filter columns 18..24 of every activity in the global ``df``.

    A 4th-order digital Butterworth low-pass filter (normalised cutoff 0.04)
    is applied with ``signal.lfilter`` to a private copy of each activity's
    rows (labels 1..13, selected through column 24).

    NOTE: the filtered arrays are local to this function and are discarded;
    the global ``df`` is not modified and nothing is returned. The function
    only demonstrates the filtering step.
    """
    # Butterworth low-pass filter. You could try different parameters and other filters.
    b, a = signal.butter(4, 0.04, 'low', analog=False)

    # Labels 1..13: sitting, lying, standing, washing dishes, vacuum, sweep,
    # walking, ascending stairs, descending stairs, treadmill, bicycling 50,
    # bicycling 100, rope jump. Iterating over the label range guarantees
    # that rope jump really uses label 13.
    for label in range(1, 14):
        # A writable float copy: the filter output is floating point and is
        # written back column by column.
        activity_data = df[df[24] == label].to_numpy(dtype=float, copy=True)
        # NOTE(review): range(18, 25) also filters column 24, which is the
        # activity label itself -- confirm whether 18..23 was intended.
        for col in range(18, 25):
            activity_data[:, col] = signal.lfilter(b, a, activity_data[:, col])

In [33]:
def _segment_features(data, segment_len, feature_columns):
    """Turn consecutive windows of ``data`` into [min, max, mean, ..., label] rows."""
    rows = []
    # Stepping by segment_len never produces an empty window; the last window
    # simply holds the remaining rows (possibly fewer than segment_len).
    for start in range(0, len(data), segment_len):
        window = data[start:start + segment_len]
        row = []
        for col in feature_columns:
            values = window[:, col]
            row.extend((np.min(values), np.max(values), np.mean(values)))
        # The window's label is the last column of its first row.
        row.append(window[0, -1])
        rows.append(row)
    return rows


def feature_engineering(n_datasets=19, segment_len=1000, train_fraction=0.8,
                        feature_columns=range(18, 25)):
    """Build the training/testing feature CSVs from the raw dataset files.

    For every file ``dataset/dataset_<k>.txt`` (k = 1..n_datasets) and every
    activity label 1..13 (column 24):

    1. columns 0..23 are low-pass filtered (4th-order Butterworth, normalised
       cutoff 0.04, applied with ``signal.lfilter``);
    2. the rows are split chronologically: the first ``train_fraction`` of
       them go to training, the rest to testing;
    3. each part is cut into consecutive, non-overlapping windows of
       ``segment_len`` rows (the last window may be shorter);
    4. each window becomes one row holding min, max and mean of every column
       in ``feature_columns`` followed by the window's activity label.

    The rows are written, without header or index, to ``training_data.csv``
    and ``testing_data.csv`` in the current directory. With the default
    arguments each row has 3 * 7 + 1 = 22 values.

    Activities missing from a file, and parts with no rows, are skipped
    instead of producing an empty window.

    NOTE(review): the default ``feature_columns`` (18..24) includes column 24,
    which is the activity label, so min/max/mean of the label are written as
    features -- this leaks the target into the feature set. The default is
    kept so the 22-column file layout stays unchanged for code that reads
    these CSVs; pass ``feature_columns=range(18, 24)`` (and adapt the
    readers) to remove the leak.

    Parameters
    ----------
    n_datasets : int
        Number of dataset files to read, starting at ``dataset_1.txt``.
    segment_len : int
        Number of rows per window; must be at least 1.
    train_fraction : float
        Fraction of each activity's rows assigned to the training part.
    feature_columns : iterable of int
        Column indices summarised into min/max/mean features.

    Raises
    ------
    ValueError
        If ``segment_len`` is smaller than 1.
    """
    if segment_len < 1:
        raise ValueError('segment_len must be a positive integer')
    feature_columns = list(feature_columns)
    n_output_cols = 3 * len(feature_columns) + 1

    # The filter design does not depend on the data, so compute it once.
    b, a = signal.butter(4, 0.04, 'low', analog=False)

    # Rows are collected in lists and converted once at the end, instead of
    # re-allocating an array for every appended row.
    training_rows = []
    testing_rows = []

    # deal with each dataset file
    for dataset_no in range(1, n_datasets + 1):
        df = pd.read_csv('dataset/dataset_' + str(dataset_no) + '.txt', sep=',', header=None)
        print('deal with dataset ' + str(dataset_no))
        for c in range(1, 14):
            # Writable float copy so the filtered values can be stored back
            # without truncation.
            activity_data = df[df[24] == c].to_numpy(dtype=float, copy=True)
            if len(activity_data) == 0:
                continue
            # Filter the 24 signal columns; column 24 (the label) is left as is.
            for j in range(24):
                activity_data[:, j] = signal.lfilter(b, a, activity_data[:, j])

            training_len = math.floor(len(activity_data) * train_fraction)
            training_rows.extend(
                _segment_features(activity_data[:training_len], segment_len, feature_columns))
            testing_rows.extend(
                _segment_features(activity_data[training_len:], segment_len, feature_columns))

    # reshape keeps the (0, n_output_cols) shape when no rows were produced.
    training = np.array(training_rows, dtype=float).reshape(-1, n_output_cols)
    testing = np.array(testing_rows, dtype=float).reshape(-1, n_output_cols)
    pd.DataFrame(training).to_csv('training_data.csv', index=False, header=False)
    pd.DataFrame(testing).to_csv('testing_data.csv', index=False, header=False)


In [34]:
# Run the feature extraction when this cell/script is executed directly:
# feature_engineering() reads the dataset/dataset_<k>.txt files and writes
# training_data.csv and testing_data.csv, printing one progress line per file.
if __name__ == '__main__':
    feature_engineering()

deal with dataset 1
deal with dataset 2
deal with dataset 3
deal with dataset 4
deal with dataset 5
deal with dataset 6
deal with dataset 7
deal with dataset 8
deal with dataset 9
deal with dataset 10
deal with dataset 11
deal with dataset 12
deal with dataset 13
deal with dataset 14
deal with dataset 15
deal with dataset 16
deal with dataset 17
deal with dataset 18
deal with dataset 19
