In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
import time

In [2]:
#Load training dataset
train_set = pd.read_csv('./HG_Dataset/D_train.csv')

#Column names for filtering purposes
all_columns = ['X0','Y0','Z0','X1','Y1','Z1','X2','Y2','Z2','X3','Y3','Z3',\
           'X4','Y4','Z4','X5','Y5','Z5','X6','Y6','Z6','X7','Y7','Z7',\
           'X8','Y8','Z8','X9','Y9','Z9','X10','Y10','Z10','X11','Y11','Z11']
x_columns = ['X0','X1','X2','X3','X4','X5','X6','X7','X8','X9','X10','X11']
y_columns = ['Y0','Y1','Y2','Y3','Y4','Y5','Y6','Y7','Y8','Y9','Y10','Y11']
z_columns = ['Z0','Z1','Z2','Z3','Z4','Z5','Z6','Z7','Z8','Z9','Z10','Z11']
final_features = ['count','y_mean','z_mean','x_std','y_std','z_std',\
                 'x_max','y_max','z_max','x_min']

# Preprocessing & Feature Extraction

In [3]:
#Preprocessing: Delete the unique identifier column
train_set.drop(columns='Unnamed: 0', inplace=True)

In [4]:
#Explore dataset: Ranges of data are comparable, so standardization is not required, will check again after feature extraction
pd.set_option('display.max_columns', 500)
train_set.describe(include='all')

Unnamed: 0,Class,User,X0,Y0,Z0,X1,Y1,Z1,X2,Y2,Z2,X3,Y3,Z3,X4,Y4,Z4,X5,Y5,Z5,X6,Y6,Z6,X7,Y7,Z7,X8,Y8,Z8,X9,Y9,Z9,X10,Y10,Z10,X11,Y11,Z11
count,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0,13464.0,13464.0,13464.0,13254.0,13254.0,13254.0,11363.0,11363.0,11363.0,9616.0,9616.0,9616.0,7191.0,7191.0,7191.0,5733.0,5733.0,5733.0,4537.0,4537.0,4537.0,2482.0,2482.0,2482.0,0.0,0.0,0.0
mean,3.025481,5.777778,52.698144,85.770524,-31.826014,52.351641,86.272265,-31.711906,52.112129,83.898329,-32.956184,51.149287,82.018282,-33.071126,50.755154,80.962652,-33.806252,48.382155,81.697311,-30.821435,46.589561,82.693686,-29.266909,46.941151,85.865447,-24.9492,50.942798,84.373487,-28.776171,57.177513,83.183279,-28.576656,53.669544,77.761579,-29.641994,,,
std,1.445896,3.823397,32.676394,40.715829,33.898217,32.353705,40.641751,34.163914,33.570688,41.417587,34.226874,34.402447,42.019487,34.992982,34.565113,43.16936,34.775522,35.985091,43.419765,35.55816,36.604571,44.187079,34.613903,39.398134,42.863033,34.522784,39.103964,44.2666,35.921888,41.614585,43.932179,37.058562,40.719084,46.033896,42.029159,,,
min,1.0,0.0,-80.928512,-98.233756,-120.096446,-111.685241,-96.142589,-114.271334,-106.886524,-89.972754,-118.950653,-89.00991,-97.565346,-133.877193,-87.860871,-86.081022,-116.422479,-88.702402,-96.89239,-134.558324,-80.114463,-21.617589,-151.5922,-108.605639,-50.233962,-108.44019,-121.182089,-0.001324,-114.500502,-83.649652,-39.539982,-112.908778,-80.196289,-2.877761,-103.718071,,,
25%,2.0,2.0,31.260337,60.571185,-58.246757,30.917769,62.849522,-59.477546,29.413095,56.8235,-60.427889,26.154476,52.914401,-61.011937,23.932549,46.851103,-61.949685,17.024806,47.819909,-59.795263,15.453349,48.034701,-58.60583,13.998841,54.37858,-51.551124,20.915813,45.916796,-59.907133,27.634371,44.872205,-58.906215,14.298679,37.354595,-70.016161,,,
50%,3.0,6.0,55.336918,87.314804,-31.697639,55.415571,88.220022,-31.292537,56.038056,87.300265,-34.074548,55.710808,86.254136,-35.102632,55.34829,86.282664,-36.420433,52.719973,88.685799,-31.714577,51.569093,90.273587,-29.604648,51.989549,91.615694,-25.351387,59.602793,89.816121,-27.74183,63.516259,87.992227,-27.79479,62.781569,84.955399,-31.392546,,,
75%,4.0,9.0,75.288905,106.228314,-5.744637,75.03867,109.185672,-4.650574,75.422747,105.40377,-6.319491,75.318645,104.339838,-5.203043,75.744093,105.154207,-7.220871,75.260199,106.39075,-1.864195,75.712522,110.399266,-0.96568,78.660546,121.178163,1.477405,80.909939,126.338919,0.410206,86.090881,126.478059,1.340623,84.08825,121.913529,6.843351,,,
max,5.0,11.0,151.586035,168.717458,113.345119,151.271413,170.20935,104.618115,149.208278,167.973416,104.590879,151.033472,168.292018,114.624261,150.507099,167.095078,112.110711,146.031461,167.127478,106.528407,151.739265,167.275662,107.1766,148.500495,167.487393,110.053853,173.906643,167.035153,119.213101,174.054403,167.196644,122.569627,149.486224,168.352478,108.211488,,,


In [5]:
#Feature Extraction (Training): Non-NA count, mean for each axis, STD for each axis, min for each axis, max for each axis
#Count
train_set['count'] = train_set[all_columns].count(axis=1)

#Means
train_set['y_mean'] = train_set[y_columns].mean(axis=1)
train_set['z_mean'] = train_set[z_columns].mean(axis=1)

#Standard Deviations
train_set['x_std'] = train_set[x_columns].std(axis=1)
train_set['y_std'] = train_set[y_columns].std(axis=1)
train_set['z_std'] = train_set[z_columns].std(axis=1)

#Maximum
train_set['x_max'] = train_set[x_columns].max(axis=1)
train_set['y_max'] = train_set[y_columns].max(axis=1)
train_set['z_max'] = train_set[z_columns].max(axis=1)

#Minimum
train_set['x_min'] = train_set[x_columns].min(axis=1)

In [6]:
#Drop the original columns
train_set.drop(columns=all_columns, inplace=True)

In [7]:
#Explore the new features: The data is relatively on the same scale, so I choose not to standardize for now
train_set.describe()

Unnamed: 0,Class,User,count,y_mean,z_mean,x_std,y_std,z_std,x_max,y_max,z_max,x_min
count,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0,13500.0
mean,3.025481,5.777778,24.031111,80.373621,-34.50368,32.439537,39.682642,26.076963,93.476938,130.757534,-2.775697,1.818376
std,1.445896,3.823397,6.452347,17.246827,20.997457,8.230813,11.935065,12.011993,20.850893,26.737334,30.435922,22.985188
min,1.0,0.0,9.0,-50.301817,-92.767794,2.099186,2.939049,2.246756,-27.857612,-39.6632,-72.571532,-121.182089
25%,2.0,2.0,18.0,67.838747,-49.998885,27.351446,32.693702,17.526247,78.941544,127.423988,-25.165756,-13.215606
50%,3.0,6.0,24.0,81.495304,-36.156457,33.480391,41.432292,25.250417,92.44015,139.227723,-2.521009,-3.355784
75%,4.0,9.0,30.0,94.251337,-18.156529,36.57178,46.865052,34.350737,108.646575,149.616753,16.989548,13.176117
max,5.0,11.0,33.0,136.535435,70.051134,80.637048,72.982286,62.816154,174.054403,170.20935,122.569627,68.288056


In [8]:
#Feature reduction: I'm using all the extracted features first then will explore reduction

In [9]:
#Convert to numpy (training features, training labels, and user ID)
train_x = train_set[final_features].to_numpy()
train_y = train_set['Class'].to_numpy()
train_user = train_set['User'].to_numpy()

# Model Evaluation & Selection

## Naive Bayes

In [10]:
#Baseline Model: Naive Bayes
NB = GaussianNB()
NB.fit(train_x,train_y)
print('Training Accuracy: ', accuracy_score(train_y, NB.predict(train_x)))

Training Accuracy:  0.912962962962963


## SVM - RBF Kernel

In [11]:
#Range of gamma and C parameter values for best parameter selection using cross-validation
gamma = np.logspace(-7,-1,num=20)
c = np.logspace(-5,1,num=20)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5

#Arrays to store cross-validation accuracy results 
accuracies = np.zeros((trials*n,len(gamma),len(c)))

In [12]:
#Split all users
user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

In [13]:
#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of gamma and C combinations with startification on the one-out set
        for i in range(len(gamma)):
            for j in range(len(c)):
                svccv = SVC(kernel='rbf', gamma=gamma[i], C=c[j], cache_size=4000)
                svccv.fit(train_x_r, train_y_r)
                accuracies[k+N*9,i,j] = accuracy_score(test_y_r, svccv.predict(test_x_r))

In [14]:
accuracies = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracies.argmax(), accuracies.shape) #Index of max accuracy in terms of the original gamma and c arrays
print('Suitable pair:\tgamma=', gamma[x[0]], 'C=',c[x[1]])
print('\t\taccuracy=', accuracies[x])

Suitable pair:	gamma= 0.0001438449888287663 C= 4.832930238571752
		accuracy= 0.9297378277153562


In [15]:
svm = SVC(kernel='rbf', gamma=2.1544346900318823e-05, C=10)
svm.fit(train_x,train_y)
print('Training Accuracy: ', accuracy_score(train_y, svm.predict(train_x)))

Training Accuracy:  0.9949629629629629


## Perceptron

In [16]:
#Tolerance range
reg = np.logspace(-7,-1,num=1000)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5

#Arrays to store cross-validation accuracy results 
accuracies = np.zeros((trials*n,len(reg)))

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of gamma and C combinations with startification on the one-out set
        for i in range(len(reg)):
            perc = Perceptron(penalty='l2', alpha=reg[i], max_iter=10000000, shuffle=True)
            perc.fit(train_x_r, train_y_r)
            accuracies[k+N*9,i] = accuracy_score(test_y_r, perc.predict(test_x_r))

In [17]:
accuracies = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracies.argmax(), accuracies.shape) #Index of max accuracy in terms of the original gamma and c arrays
print('Suitable pair:\treg=', reg[x[0]])
print('\t\taccuracy=', accuracies[x])

Suitable pair:	reg= 1.1169868184678226e-07
		accuracy= 0.72521847690387


In [18]:
perc = Perceptron(tol=14.831025143361028, max_iter=10000000)
perc.fit(train_x, train_y)
print('Training Accuracy: ', accuracy_score(train_y, perc.predict(train_x)))

Training Accuracy:  0.742


## Linear Discriminant Analysis (LDA)

In [19]:
#Shrinkage range
shrinkage = np.logspace(-6,0,num=100)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5

#Arrays to store cross-validation accuracy results 
accuracies = np.zeros((trials*n,len(shrinkage)))

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of gamma and C combinations with startification on the one-out set
        for i in range(len(shrinkage)):
            lda = LinearDiscriminantAnalysis(shrinkage=shrinkage[i], solver='lsqr')
            lda.fit(train_x_r, train_y_r)
            accuracies[k+N*9,i] = accuracy_score(test_y_r, lda.predict(test_x_r))

In [20]:
accuracies = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracies.argmax(), accuracies.shape) #Index of max accuracy in terms of the original gamma and c arrays
print('Suitable pair:\treg=', shrinkage[x[0]])
print('\t\taccuracy=', accuracies[x])

Suitable pair:	reg= 0.0003511191734215131
		accuracy= 0.9046192259675403


In [21]:
lda = LinearDiscriminantAnalysis(shrinkage=0.0054622772176843425, solver='lsqr')
lda.fit(train_x, train_y)
print('Training Accuracy: ', accuracy_score(train_y, lda.predict(train_x)))

Training Accuracy:  0.9437037037037037


In [22]:
lda = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
lda.fit(train_x, train_y)
print('Training Accuracy: ', accuracy_score(train_y, lda.predict(train_x)))

Training Accuracy:  0.9328888888888889


## Quadratic Discriminant Analysis (QDA)

In [23]:
#Regularization range
reg = np.logspace(-6,0,num=1000)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5

#Arrays to store cross-validation accuracy results 
accuracies = np.zeros((trials*n,len(reg)))

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of gamma and C combinations with startification on the one-out set
        for i in range(len(reg)):
            qda = QuadraticDiscriminantAnalysis(reg_param=reg[i])
            qda.fit(train_x_r, train_y_r)
            accuracies[k+N*9,i] = accuracy_score(test_y_r, qda.predict(test_x_r))

In [24]:
accuracies = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracies.argmax(), accuracies.shape) #Index of max accuracy in terms of the original gamma and c arrays
print('Suitable pair:\treg=', reg[x[0]])
print('\t\taccuracy=', accuracies[x])

Suitable pair:	reg= 0.312964801067075
		accuracy= 0.8938826466916355


In [25]:
qda = QuadraticDiscriminantAnalysis(reg_param=0.17347593592339325)
qda.fit(train_x, train_y)
print('Training Accuracy: ', accuracy_score(train_y, qda.predict(train_x)))

Training Accuracy:  0.992


## KNN

In [26]:
#Neighbors range
neighbors = np.arange(1,400,1)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5

#Arrays to store cross-validation accuracy results 
accuracies = np.zeros((trials*n,len(neighbors)))

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of gamma and C combinations with startification on the one-out set
        for i in range(len(neighbors)):
            knn = KNeighborsClassifier(n_neighbors=neighbors[i],n_jobs=-1)
            knn.fit(train_x_r, train_y_r)
            accuracies[k+N*9,i] = accuracy_score(test_y_r, knn.predict(test_x_r))

In [27]:
accuracies = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracies.argmax(), accuracies.shape) #Index of max accuracy in terms of the original gamma and c arrays
print('Suitable pair:\treg=', neighbors[x[0]])
print('\t\taccuracy=', accuracies[x])

Suitable pair:	reg= 111
		accuracy= 0.7677902621722846


In [28]:
knn = KNeighborsClassifier(n_neighbors=1,n_jobs=-1,weights='distance')
knn.fit(train_x, train_y)
print('Training Accuracy: ', accuracy_score(train_y, knn.predict(train_x)))

Training Accuracy:  1.0


## SVM - Linear Kernel

In [29]:
#Range of C parameter values for best parameter selection using cross-validation
c = np.logspace(-5,1,num=50)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5

#Arrays to store cross-validation accuracy results 
accuracies = np.zeros((trials*n,len(c)))

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        for i in range(len(c)):
            svccv = SVC(kernel='linear', C=c[i], cache_size=1000)
            svccv.fit(train_x_r, train_y_r)
            accuracies[k+N*9,i] = accuracy_score(test_y_r, svccv.predict(test_x_r))

In [30]:
accuracies = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracies.argmax(), accuracies.shape) #Index of max accuracy in terms of the original gamma and c arrays
print('Suitable pair:\tC=', c[x[0]])
print('\t\taccuracy=', accuracies[x])

Suitable pair:	C= 0.00022229964825261955
		accuracy= 0.8953807740324594


In [31]:
svc = SVC(kernel='linear', C=0.001)
svc.fit(train_x, train_y)
print('Training Accuracy: ', accuracy_score(train_y, svc.predict(train_x)))

Training Accuracy:  0.9911851851851852


# Standardization

In [32]:
#Let's try to standardize our best model (SVM linear) - the SVM is scale variant so it might benefit
stand = preprocessing.StandardScaler().fit(train_x)
train_x_stand = stand.transform(train_x)

In [33]:
#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = stand.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = stand.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))

In [34]:
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x]) #No improvement noted

		accuracy= 0.8296


# Feature Selection

## ANOVA F-value - LDA

In [35]:
red = SelectKBest(f_classif, k=9).fit(train_x, train_y)
train_x_reduced = red.transform(train_x)
lda = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
lda.fit(train_x_reduced, train_y)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = red.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = red.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x])

		accuracy= 0.8934222222222222


In [36]:
red = SelectKBest(f_classif, k=8).fit(train_x, train_y)
train_x_reduced = red.transform(train_x)
lda = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
lda.fit(train_x_reduced, train_y)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = red.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = red.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x])

		accuracy= 0.8337777777777775


In [37]:
red = SelectKBest(f_classif, k=7).fit(train_x, train_y)
train_x_reduced = red.transform(train_x)
lda = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
lda.fit(train_x_reduced, train_y)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = red.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = red.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x])

		accuracy= 0.8232000000000003


In [38]:
#Features are already reduced on this file, these are just sample runs on additional feature reduction

## Mutual information (MI) -  LDA

In [39]:
red = SelectKBest(mutual_info_classif, k=9).fit(train_x, train_y)
train_x_reduced = red.transform(train_x)
lda = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
lda.fit(train_x_reduced, train_y)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = red.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = red.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x])

		accuracy= 0.8570666666666666


In [40]:
red = SelectKBest(mutual_info_classif, k=8).fit(train_x, train_y)
train_x_reduced = red.transform(train_x)
lda = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
lda.fit(train_x_reduced, train_y)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = red.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = red.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x])

		accuracy= 0.847111111111111


In [41]:
red = SelectKBest(mutual_info_classif, k=7).fit(train_x, train_y)
train_x_reduced = red.transform(train_x)
lda = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
lda.fit(train_x_reduced, train_y)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = red.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = red.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x])

		accuracy= 0.8311111111111111


In [42]:
#Two features removed also: zmin and xmean

## Combining the two results: 10 features - LDA

In [43]:
#Results already combined, this is just a sample run of approach
#train_x_reduced = train_x[:,np.array([0,2,3,4,5,6,7,8,9,10])]
#test_x_reduced = test_x[:,np.array([0,2,3,4,5,6,7,8,9,10])]
#lda = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
#lda.fit(train_x_reduced, train_y)
#print('Training Accuracy: ', accuracy_score(train_y, lda.predict(train_x_reduced)))

## ANOVA F-value - SVM

In [44]:
red = SelectKBest(f_classif, k=9).fit(train_x, train_y)
train_x_reduced = red.transform(train_x)
svc = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
svc.fit(train_x_reduced, train_y)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = red.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = red.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x])

		accuracy= 0.859111111111111


In [45]:
red = SelectKBest(f_classif, k=8).fit(train_x, train_y)
train_x_reduced = red.transform(train_x)
svc = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
svc.fit(train_x_reduced, train_y)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = red.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = red.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x])

		accuracy= 0.8488888888888888


In [46]:
red = SelectKBest(f_classif, k=7).fit(train_x, train_y)
train_x_reduced = red.transform(train_x)
svc = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
svc.fit(train_x_reduced, train_y)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = red.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = red.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x])

		accuracy= 0.7888888888888889


## Mutual information (MI) -  SVM

In [47]:
red = SelectKBest(mutual_info_classif, k=9).fit(train_x, train_y)
train_x_reduced = red.transform(train_x)
svc = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
svc.fit(train_x_reduced, train_y)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = red.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = red.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x])

		accuracy= 0.8475555555555556


In [48]:
red = SelectKBest(mutual_info_classif, k=8).fit(train_x, train_y)
train_x_reduced = red.transform(train_x)
svc = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
svc.fit(train_x_reduced, train_y)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = red.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = red.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x])

		accuracy= 0.8640000000000001


In [49]:
red = SelectKBest(mutual_info_classif, k=7).fit(train_x, train_y)
train_x_reduced = red.transform(train_x)
svc = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
svc.fit(train_x_reduced, train_y)

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 5
accuracies = np.zeros((trials*n))

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Cross-Validation
for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=50)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=50)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=50)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=50)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=50)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = red.transform(balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy())
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = red.transform(balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy())
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        #Cross-validate a range of C values with startification on the one-out set
        svccv = SVC(kernel='linear')
        svccv.fit(train_x_r, train_y_r)
        accuracies[k+N*9] = accuracy_score(test_y_r, svccv.predict(test_x_r))
accuracy = np.mean(accuracies,axis=0) #Average out the different cross-validation runs
x = np.unravel_index(accuracy.argmax(), accuracy.shape) #Index of max accuracy
print('\t\taccuracy=', accuracy[x])

		accuracy= 0.783111111111111


In [50]:
#Same results

## Combining the two results: 10 features - LDA

In [51]:
#Results already combined, this is just a test run of method
#train_x_reduced = train_x[:,np.array([0,2,3,4,5,6,7,8,9,10])]
#svc = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
#svc.fit(train_x_reduced, train_y)
#print('Training Accuracy: ', accuracy_score(train_y, svc.predict(train_x_reduced)))

In [52]:
#We can safely discard x-mean, z-min, and y-min throughout the analysis as they share information already in other features