#Feature Engineering - Sliding Window, Several Models

Load libraries and connect to Google Drive to access dataset

In [3]:
#@title
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive
%cd /gdrive/MyDrive/CSC8635/
## Check that the data_subjects_info.csv is in the current folder location
#@title
## set up the libraries that are needed to run the analysis
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

Mounted at /gdrive
/gdrive
/gdrive/MyDrive/CSC8635


Code to load in data and add labels

In [4]:
#@title
#based on code: https://github.com/mmalekzadeh/motion-sense/blob/master/codes/gen_paper_codes/1_MotionSense_Trial.ipynb

import numpy as np
import pandas as pd
##_____________________________

def get_ds_infos():
    ## 0:Code, 1:Weight, 2:Height, 3:Age, 4:Gender
    dss = np.genfromtxt("data_subjects_info.csv",delimiter=',')
    dss = dss[1:]
    print("----> Data subjects information is imported.")
    return dss
##____________

def creat_time_series(num_features, num_act_labels, num_gen_labels, label_codes, trial_codes):
    dataset_columns = num_features+num_act_labels+num_gen_labels
    ds_list = get_ds_infos()
    train_data = np.zeros((0,dataset_columns))
    test_data = np.zeros((0,dataset_columns))
    for i, sub_id in enumerate(ds_list[:,0]):
        for j, act in enumerate(label_codes):
            for trial in trial_codes[act]:
                fname = 'A_DeviceMotion_data/A_DeviceMotion_data/'+act+'_'+str(trial)+'/sub_'+str(int(sub_id))+'.csv'
                raw_data = pd.read_csv(fname)
                raw_data = raw_data.drop(['Unnamed: 0'], axis=1)
                unlabel_data = raw_data.values
                label_data = np.zeros((len(unlabel_data), dataset_columns))
                label_data[:,:-(num_act_labels + num_gen_labels)] = unlabel_data
                label_data[:,label_codes[act]] = 1
                label_data[:,-(num_gen_labels)] = int(ds_list[i,4])
                ## We consider long trials as training dataset and short trials as test dataset
                if trial > 10:
                    test_data = np.append(test_data, label_data, axis = 0)
                else:    
                    train_data = np.append(train_data, label_data, axis = 0)
    return train_data , test_data
#________________________________


In [5]:
#@title
print("--> Start...")
## Here we set parameter to build labelled time-series from dataset of "(A)DeviceMotion_data"
num_features = 12 # attitude(roll, pitch, yaw); gravity(x, y, z); rotationRate(x, y, z); userAcceleration(x,y,z)
num_act_labels = 6 # dws, ups, wlk, jog, sit, std
num_gen_labels = 1 # 0/1(female/male)
label_codes = {"dws":num_features, "ups":num_features+1, "wlk":num_features+2, "jog":num_features+3, "sit":num_features+4, "std":num_features+5}
trial_codes = {"dws":[1,2,11], "ups":[3,4,12], "wlk":[7,8,15], "jog":[9,16], "sit":[5,13], "std":[6,14]}    
## Calling 'creat_time_series()' to build time-series

--> Start...


Load data - may take up to 4 minutes

In [6]:
#@title
print("--> Building Training and Test Datasets...")
train_ts, test_ts = creat_time_series(num_features, num_act_labels, num_gen_labels, label_codes, trial_codes)

--> Building Training and Test Datasets...
----> Data subjects information is imported.


In [75]:
#@title
print("--> Shape of Training Time-Series:", train_ts.shape)
print("--> Shape of Test Time-Series:", test_ts.shape)

--> Shape of Training Time-Series: (1081446, 19)
--> Shape of Test Time-Series: (331419, 19)


In [76]:
#@title
print('optional: channel-wise normalisation')
mn = np.mean(train_ts[:,:-1], axis=0)
std = np.std(train_ts[:,:-1], axis=0)
train_ts[:,:-1] = (train_ts[:,:-1]-mn)/std
test_ts[:,:-1] = (test_ts[:,:-1]-mn)/std

optional: channel-wise normalisation


In [77]:
#@title
cNum=12
win_len = 100
dim = train_ts.shape[1]-1 #minus the label col
print(dim)

18


Functions for sliding window and frame size

In [78]:
#@title
def sliding_window(Xy, percentage=0.5, win_len=100):
    #print('per')
    print('overlapping_percentage:  '+str(percentage*100)+'%')

    batch_size0 = int(np.floor(len(Xy)/win_len))
    #print(batch_size0)
    Xy0 = Xy[:win_len*batch_size0,:]
    if percentage==1:
        print('100% overlapping means no overlapping')
        return Xy0

    K_fold = int(1/(1-percentage))
    #print(K_fold)
    offset = int(np.round(win_len * (1-percentage)))
    Xy_big = []
    for k in range(K_fold):
        #print(k)
        current_set = Xy0[k*offset:len(Xy0)-(win_len-k*offset),:]
        #print(len(current_set))
        Xy_big.append(current_set)
    return np.concatenate(Xy_big)

def _to_frames(Xy, dim, win_len=100):
    X3D = np.reshape(Xy[:,:-1], (-1, win_len, dim))
    y2D = np.reshape(Xy[:,-1],(-1, win_len)).astype(int)
    y = np.zeros(len(y2D))
    for i in range(len(y2D)):
        counts = np.bincount(np.reshape(y2D[i, :], (-1)))
        y[i] = np.argmax(counts)
    return X3D, y

In [79]:
#@title
print('original training data shape', train_ts.shape)
X_train = sliding_window(train_ts, 0.5, win_len)
print('after overlapping sliding window', X_train.shape)
X_train0, y_train0 = _to_frames(X_train,dim, win_len)
print('converting to frames (50 samples per window) with size', X_train0.shape)

#print(test.shape)
Xy_test = sliding_window(test_ts, 0.5, win_len)
#print(test.shape)
X_test0, y_test0 = _to_frames(Xy_test,dim,win_len)

original training data shape (1081446, 19)
overlapping_percentage:  50.0%
after overlapping sliding window (2162600, 19)
converting to frames (50 samples per window) with size (21626, 100, 18)
overlapping_percentage:  50.0%


In [80]:
#@title
def building_balanced_DB(X, y, cNum, sample_number):
    print('each class will have the same sample number')
    X_new = np.empty([0, win_len, dim])
    y_new = np.empty([0])
    for i in range(cNum):
        if len(X[y==i])<sample_number:
            print('Error: not enough samples for class '+str(i) +'please choose a smaller number')
            break
        else:
            X_new = np.concatenate((X_new, X[y==i][:sample_number]), axis=0)
            #print(X_new.shape)
        
        y_new = np.concatenate((y_new, y[y==i][:sample_number]), axis=0)
        #print(y_new)
    return X_new, y_new

Balance the data

In [81]:
#@title
print('now it is the balanced data')
size_per_class_train = 100
size_per_class_test = 10
X_train0, y_train0 = building_balanced_DB(X_train0, y_train0, cNum, size_per_class_train)
X_test0, y_test0 = building_balanced_DB(X_test0, y_test0, cNum, size_per_class_test)

now it is the balanced data
each class will have the same sample number
Error: not enough samples for class 2please choose a smaller number
each class will have the same sample number
Error: not enough samples for class 2please choose a smaller number


In [82]:
#@title
print('original dimension:', X_train0.shape)
X_train = np.reshape(X_train0, (-1, dim*win_len))
X_test = np.reshape(X_test0, (-1, dim*win_len))
print('reshaped dimension:', X_train.shape)

y_train = y_train0.copy()
y_test = y_test0.copy()

original dimension: (200, 100, 18)
reshaped dimension: (200, 1800)


In [83]:
#@title
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from time import time
name = ' logistic_regression_classifier '
default_C = 1

t0 = time()
clf = LogisticRegression(penalty='l2',solver = 'lbfgs', C = default_C)
clf.fit(X_train, y_train)
print('training'+name+'takes {:.2f} seconds'.format(time()-t0))

y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('classification accuracy: {:.2f}'.format(acc))
print('random  guess  accuracy: {:.2f}'.format(1/cNum))

training logistic_regression_classifier takes 0.12 seconds
classification accuracy: 0.75
random  guess  accuracy: 0.08


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Scale the training data

In [84]:
#@title
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(X_train)
# Apply transform to both the training set and the test set.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Run models on normalised data

In [85]:
#@title
from sklearn.metrics import accuracy_score, f1_score
names = ["Logistic Regression", "Linear SVM", 'RBF SVM', "Decision Tree", "Random Forest", "K-Nearest Neighbour"]

classifiers = [
    
    LogisticRegression(penalty='l2',solver = 'lbfgs', C = default_C),
    SVC(kernel="linear", C=default_C),
    SVC(kernel="rbf", C=default_C),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=100),
    KNeighborsClassifier(5)]

for name, clf in zip(names, classifiers):
    t0=time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test,y_pred)
    #,average ='macro')
    #f1_score(Y_test,LR_L1.predict(X_test),average ='macro'))
    acc = accuracy_score(y_test, y_pred)
    print(name+' accuracy: {:.2f}, f1 score: {:.2f}, total_time: {:.2f} sec'.format(acc, f1, time()-t0))

Logistic Regression accuracy: 0.75, f1 score: 0.67, total_time: 0.08 sec
Linear SVM accuracy: 0.75, f1 score: 0.67, total_time: 0.02 sec
RBF SVM accuracy: 1.00, f1 score: 1.00, total_time: 0.03 sec
Decision Tree accuracy: 0.95, f1 score: 0.95, total_time: 0.09 sec
Random Forest accuracy: 0.95, f1 score: 0.95, total_time: 0.29 sec
K-Nearest Neighbour accuracy: 0.70, f1 score: 0.57, total_time: 0.01 sec


Perform feature engineering to get features that can be:

1) linearly separable

2) low dimensional and structured

so all the above classifiers can be used.

In [86]:
#@title
from scipy.stats import kurtosis, skew, iqr, median_absolute_deviation 
def _FE(X):
    X_mean = np.mean(X, axis=1)
    X_std = np.std(X, axis=1)
    X_var = np.var(X, axis=1)
    X_energy = np.sqrt(np.mean(np.power(X, 2), axis=1))
    X_mad= np.mean(np.abs(X - np.expand_dims(X_mean, axis=1)), axis=1)
    X_mad1 = median_absolute_deviation(X, axis=1)
    X_kurtosis = kurtosis(X, axis = 1)
    X_skew = skew(X, axis = 1)
    X_iqr = iqr(X, axis = 1)
    X_max = np.amax(X, axis=1)
    X_min = np.amin(X, axis=1)
    
    return np.concatenate((X_mean 
                           ,X_std
                           ,X_var
                           ,X_energy
                           ,X_mad
                           ,X_mad1
                           ,X_kurtosis
                           ,X_skew
                           ,X_iqr
                           ,X_max
                           ,X_min
                          ), axis=1)

In [87]:
#@title
#print('performing feature engineering')
X_train = _FE(X_train0)
X_test = _FE(X_test0)
print('new feature dimension: '+str(X_train.shape[1]))
y_train = y_train0.copy()
y_test = y_test0.copy()

new feature dimension: 198


In [88]:
#@title
print('performing feature-wise normalisation!')
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(X_train)
# Apply transform to both the training set and the test set.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

performing feature-wise normalisation!


In [89]:
#@title
from sklearn.metrics import accuracy_score, f1_score
names = ["Logistic Regression", "Linear SVM", 'RBF SVM', "Decision Tree", "Random Forest", "K-Nearest Neighbour"]

classifiers = [
    
    LogisticRegression(penalty='l2',solver = 'lbfgs', C = default_C),
    SVC(kernel="linear", C=default_C),
    SVC(kernel="rbf", C=default_C),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=100),
    KNeighborsClassifier(5)]

for name, clf in zip(names, classifiers):
    t0=time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test,y_pred)
    #,average ='macro')
    #f1_score(Y_test,LR_L1.predict(X_test),average ='macro'))
    acc = accuracy_score(y_test, y_pred)
    print(name+' accuracy: {:.2f}, f1 score: {:.2f}, total_time: {:.2f} sec'.format(acc, f1, time()-t0))

Logistic Regression accuracy: 1.00, f1 score: 1.00, total_time: 0.02 sec
Linear SVM accuracy: 1.00, f1 score: 1.00, total_time: 0.01 sec
RBF SVM accuracy: 1.00, f1 score: 1.00, total_time: 0.01 sec
Decision Tree accuracy: 1.00, f1 score: 1.00, total_time: 0.01 sec
Random Forest accuracy: 1.00, f1 score: 1.00, total_time: 0.25 sec
K-Nearest Neighbour accuracy: 1.00, f1 score: 1.00, total_time: 0.00 sec


#Conclusions

The data cannot be balanced.
Feature engineering improves the accuracy.