In [1]:
import numpy as np 
import pandas as pd 
from scipy import signal
import matplotlib.pyplot as plt 
import math
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

#Commented by: James Xi Zheng 12/Aug/2019
#please create functions to do the following jobs
#1. load dataset ->  sample code available in the workshops
#2. visualize data -> sample code given
#3. remove signal noises -> sample code given
#4. extract features -> sample code given
#5. prepare training set -> sample code given 
#6. training the given models -> sample code given
#7. test the given models -> sample code given
#8. print out the evaluation results -> sample code given

#as I said in the lecture, the sample code is written in an unprofessional software engineering style
#software refactoring is required
#please manage the project using SCRUM sprints and manage the source code using GitHub
#document your progress and think critically about what is missing from this IoT application and what is needed to move it from a PoC (proof of concept) to solving real-world problems
#think about which added components would let it solve which kinds of real-world problems -> this shall be discussed in the conclusion part of the document


def load_dataset():
    """Read each of the 19 dataset files and print its size and first rows.

    Files are read from ``dataset/dataset_<n>.txt`` for n = 1..19 as
    comma-separated text without a header row. Nothing is returned; the
    function only prints a row count and ``DataFrame.head()`` per file.
    """
    for participant in range(1, 20):
        path = 'dataset/dataset_{}.txt'.format(participant)
        frame = pd.read_csv(path, sep=',', header=None)
        print('Dataset {} contains {} rows.'.format(participant, len(frame)))
        # head() shows the first 5 rows of the dataset
        print(frame.head())
        

'''
This code graphs the sensor data of the wrist for all 19 participants for all 13 activities
'''
def data_visulization():
    """Plot the unfiltered sensor traces for every participant and activity.

    For each of the 19 dataset files and each of the 13 activity labels
    (stored in column 24), two figures are shown: columns 0-2 and columns
    3-5 of the rows belonging to that activity.
    """
    print("These are the graphs of un-filtered data")
    # Activity names, ordered so that label k maps to activitiesList[k - 1].
    activitiesList = ['Sitting', 'Lying', 'Standing', 'Washing Dishes', 'Vacuuming', 'Sweeping', 'Walking', 'Ascending Stairs', 'Destending Stairs', 'Treadmill Running', 'Bicycling (50W)', 'Bicycling (100W)', 'Rope Jumping']
    # Column groups plotted per activity: accelerometer first, then gyroscope.
    column_groups = (slice(0, 3), slice(3, 6))

    for person in range(1, 20):
        frame = pd.read_csv('dataset/dataset_{}.txt'.format(person), sep=',', header=None)
        for label, activity in enumerate(activitiesList, start=1):
            print('Person {}, Activity {}'.format(person, activity))
            rows = frame[frame[24] == label].values
            for columns in column_groups:
                plt.plot(rows[:, columns])
                plt.show()

    print("Finished displaying un-filtered data")
    
'''
Raw sensor data usually contains noise that arises from different sources, such as sensor
miscalibration, sensor errors, errors in sensor placement, or noisy environments. We can apply a filter to remove noise from the sensor data
and smooth it. In this example code, a Butterworth low-pass filter is applied.
'''
     
  
def noise_removing():
    """Plot the low-pass-filtered sensor traces for every participant and activity.

    For each of the 19 dataset files and each of the 13 activity labels
    (stored in column 24), the first six columns of the matching rows are
    passed through a Butterworth low-pass filter and then shown as two
    figures: columns 0-2 and columns 3-5.
    """
    print("These are the graphs after noise removal")
    # Activity names, ordered so that label k maps to activitiesList[k - 1].
    activitiesList = ['Sitting', 'Lying', 'Standing', 'Washing Dishes', 'Vacuuming', 'Sweeping', 'Walking', 'Ascending Stairs', 'Destending Stairs', 'Treadmill Running', 'Bicycling (50W)', 'Bicycling (100W)', 'Rope Jumping']

    for person in range(1, 20):
        frame = pd.read_csv('dataset/dataset_{}.txt'.format(person), sep=',', header=None)
        for label, activity in enumerate(activitiesList, start=1):
            # Butterworth low-pass filter (order 4, critical frequency 0.04).
            # You could try different parameters and other filters.
            numerator, denominator = signal.butter(4, 0.04, 'low', analog=False)
            rows = frame[frame[24] == label].values
            # Filter only the six columns that are plotted below.
            for column in range(6):
                rows[:, column] = signal.lfilter(numerator, denominator, rows[:, column])
            print('Person {}, Activity {}'.format(person, activity))
            for columns in (slice(0, 3), slice(3, 6)):
                plt.plot(rows[:, columns])
                plt.show()
    

'''
To build a human activity recognition system, we need to extract features from raw data and create feature dataset for training 
machine learning models.

Please create new functions to implement your own feature engineering. The function should output training and testing dataset.
'''
#Min, max, and standard deviation were the features used
def feature_engineering_example():
    """Build the training and testing feature sets and write them to CSV.

    For each of the 19 participant files (``dataset/dataset_<n>.txt``) and
    each of the 13 activity labels stored in column 24:

    1. the 24 sensor columns are low-pass filtered (Butterworth, order 4,
       critical frequency 0.04);
    2. the rows are split in order into the first 80% (training) and the
       remaining 20% (testing);
    3. each part is cut into consecutive segments of up to 1000 rows, and
       every segment is turned into one 73-value row by
       ``trainingtestingData`` (3 features per sensor column plus the label).

    The results are written to ``training_data.csv`` and
    ``testing_data.csv`` without a header row and without the pandas row
    index, so the files contain only the 72 features followed by the label.
    Nothing is returned.
    """
    segment_len = 1000
    # 73 columns: 24 measurements * 3 features + 1 for the activity label
    training = np.empty(shape=(0, 73))
    testing = np.empty(shape=(0, 73))

    # The filter coefficients do not depend on the data, so design them once.
    b, a = signal.butter(4, 0.04, 'low', analog=False)

    for i in range(19):
        df = pd.read_csv('dataset/dataset_' + str(i + 1) + '.txt', sep=',', header=None)
        print('deal with dataset ' + str(i + 1))

        for c in range(1, 14):
            activity_data = df[df[24] == c].values
            if len(activity_data) == 0:
                # No rows recorded for this activity; there is nothing to
                # filter or segment.
                continue

            for j in range(24):
                activity_data[:, j] = signal.lfilter(b, a, activity_data[:, j])

            # Separate the training and testing data in an 80/20 split.
            training_len = math.floor(len(activity_data) * 0.8)
            training_data = activity_data[:training_len, :]
            testing_data = activity_data[training_len:, :]

            # Stepping by segment_len yields full segments plus one shorter
            # tail segment when the length is not a multiple of segment_len.
            # It never yields an empty segment, which would make the
            # min/max reductions in trainingtestingData raise.
            for start in range(0, len(training_data), segment_len):
                training = trainingtestingData(
                    training, training_data[start:start + segment_len, :])

            # The testing split is segmented by its own length.
            for start in range(0, len(testing_data), segment_len):
                testing = trainingtestingData(
                    testing, testing_data[start:start + segment_len, :])

    df_training = pd.DataFrame(training)
    df_testing = pd.DataFrame(testing)
    # index=False keeps the pandas row index out of the files; otherwise it is
    # read back as an extra first column and treated as a feature.
    df_training.to_csv('training_data.csv', sep=',', header=None, index=False)
    df_testing.to_csv('testing_data.csv', sep=',', header=None, index=False)
    
                    
        
# Refactored helper shared by the training and testing paths to avoid duplicated code
def trainingtestingData(concat, data):
    """Append one feature row computed from ``data`` to ``concat``.

    For each of the first 24 columns of ``data`` the minimum, maximum and
    standard deviation are computed, in that order, giving 72 values. The
    value in the first row of the last column is appended as the final,
    73rd entry.

    Parameters:
        concat: 2-D array the new row is appended to along axis 0.
        data: 2-D array holding one segment of samples.

    Returns:
        A new array made of ``concat`` followed by the single new row.
    """
    features = []
    for column_index in range(24):
        column = data[:, column_index]
        features.extend([np.min(column), np.max(column), np.std(column)])
    # The last column's value in the first row is stored as the row's label.
    features.append(data[0, -1])
    row = np.array([features])
    return np.concatenate((concat, row), axis=0)

'''
When we have training and testing feature set, we could build machine learning models to recognize human activities.

Please create new functions to fit your features and try other models.
'''
#This helper was extracted during refactoring; it builds the y_train / y_test label arrays from the given column of a loaded feature file
def convertData(shape, file):
    """Return the values of column ``shape`` of ``file``, shifted down by one, as ints.

    Parameters:
        shape: label of the column to extract.
        file: DataFrame the column is taken from.

    Returns:
        An integer array holding ``file[shape] - 1``; ``file`` is not modified.
    """
    shifted = file[shape].values - 1
    return shifted.astype(int)

def model_training_and_evaluation_KNN():
    """Train a 4-nearest-neighbours classifier on the saved features and print its test results.

    Reads ``training_data.csv`` and ``testing_data.csv``, takes the last
    column of each as the label (shifted down by one via ``convertData``)
    and the remaining columns as features, standardises the features with a
    scaler fitted on the training set, fits the classifier, and prints the
    accuracy and confusion matrix on the testing set.
    """
    train_frame = pd.read_csv('training_data.csv', header=None)
    test_frame = pd.read_csv('testing_data.csv', header=None)

    # Training set: the last column is the label, everything else is a feature.
    train_label_col = train_frame.shape[1] - 1
    y_train = convertData(train_label_col, train_frame)
    X_train = train_frame.drop([train_label_col], axis=1).values
    print("training input done ")

    # Testing set: same layout as the training set.
    test_label_col = test_frame.shape[1] - 1
    y_test = convertData(test_label_col, test_frame)
    X_test = test_frame.drop([test_label_col], axis=1).values
    print("testing input done")

    # Feature normalization: StandardScaler centres each feature around zero.
    # It is fitted on the training features only and then applied to both sets.
    # You could try other normalization methods.
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    print("feature normlization done")

    # Build the KNN classifier with 4 neighbours and predict the testing set.
    knn = KNeighborsClassifier(n_neighbors=4)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    print('Accuracy: ', accuracy_score(y_test, y_pred))
    # The confusion matrix shows the classification result for each activity.
    print(confusion_matrix(y_test, y_pred))


def model_training_and_evaluation_SVC():
    """Grid-search an SVC on the saved features and print its test results.

    Reads ``training_data.csv`` and ``testing_data.csv``, takes the last
    column of each as the label (shifted down by one via ``convertData``)
    and the remaining columns as features, standardises the features with a
    scaler fitted on the training set, runs a 10-fold cross-validated grid
    search over RBF and linear kernels, and prints the selected estimator
    together with its accuracy and confusion matrix on the testing set.
    """
    df_training = pd.read_csv('training_data.csv', header=None)
    df_testing = pd.read_csv('testing_data.csv', header=None)

    # Training set: the last column is the label, everything else is a feature.
    train_label_col = df_training.shape[1] - 1
    y_train = convertData(train_label_col, df_training)
    X_train = df_training.drop(train_label_col, axis=1).values
    print("training input done ")

    # Testing set: labels and features must come from the testing frame and
    # its own last column, not from the training frame.
    test_label_col = df_testing.shape[1] - 1
    y_test = convertData(test_label_col, df_testing)
    X_test = df_testing.drop(test_label_col, axis=1).values
    print("testing input done")

    # Feature normalization: StandardScaler centres each feature around zero.
    # It is fitted on the training features only and then applied to both sets.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    print("feature normlization done")

    # Parameter grid for the SVC model (each C value listed once).
    tuned_parameters = [
        {'kernel': ['rbf'],
         'gamma': [1e-1, 1e-2, 1e-3, 1e-4],
         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]},
        {'kernel': ['linear'],
         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]},
    ]
    acc_scorer = make_scorer(accuracy_score)
    grid_obj = GridSearchCV(SVC(), tuned_parameters, cv=10, scoring=acc_scorer)
    grid_obj = grid_obj.fit(X_train, y_train)

    # With GridSearchCV's default refit=True, best_estimator_ has already been
    # re-fitted on the whole training set, so no further fit call is needed.
    clf = grid_obj.best_estimator_
    print('best clf:', clf)

    y_pred = clf.predict(X_test)
    # Print the accuracy and confusion matrix.
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
        
    

if __name__ == '__main__':
    # Steps run when the file is executed as a script, in order.
    # Optional steps that are disabled by default and can be added here:
    #   load_dataset, data_visulization, noise_removing,
    #   model_training_and_evaluation_SVC
    pipeline = (
        feature_engineering_example,
        model_training_and_evaluation_KNN,
    )
    for step in pipeline:
        step()

deal with dataset 1
deal with dataset 2
deal with dataset 3
deal with dataset 4
deal with dataset 5
deal with dataset 6
deal with dataset 7
deal with dataset 8
deal with dataset 9
deal with dataset 10
deal with dataset 11
deal with dataset 12
deal with dataset 13
deal with dataset 14
deal with dataset 15
deal with dataset 16
deal with dataset 17
deal with dataset 18
deal with dataset 19
