In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.ensemble import VotingClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib as mpl
import matplotlib.pyplot as plt
import time

In [2]:
#Load training and testing datasets
train_set = pd.read_csv('./HG_Dataset/D_train.csv')
train_set_large = pd.read_csv('./HG_Dataset/D_train_large.csv')
test_set = pd.read_csv('./HG_Dataset/D_test.csv')

#Column names for filtering purposes
all_columns = ['X0','Y0','Z0','X1','Y1','Z1','X2','Y2','Z2','X3','Y3','Z3',\
           'X4','Y4','Z4','X5','Y5','Z5','X6','Y6','Z6','X7','Y7','Z7',\
           'X8','Y8','Z8','X9','Y9','Z9','X10','Y10','Z10','X11','Y11','Z11']
x_columns = ['X0','X1','X2','X3','X4','X5','X6','X7','X8','X9','X10','X11']
y_columns = ['Y0','Y1','Y2','Y3','Y4','Y5','Y6','Y7','Y8','Y9','Y10','Y11']
z_columns = ['Z0','Z1','Z2','Z3','Z4','Z5','Z6','Z7','Z8','Z9','Z10','Z11']
final_features = ['count','y_mean','z_mean','x_std','y_std','z_std',\
                 'x_max','y_max','z_max','x_min']

# Preprocessing & Feature Extraction

In [3]:
#Preprocessing: Delete the unique identifier column
train_set.drop(columns='Unnamed: 0', inplace=True)
test_set.drop(columns='Unnamed: 0', inplace=True)

In [4]:
#Feature Extraction (Training): Reduced set from the feature selection performed in previous run
#Count
train_set['count'] = train_set[all_columns].count(axis=1)

#Means
train_set['y_mean'] = train_set[y_columns].mean(axis=1)
train_set['z_mean'] = train_set[z_columns].mean(axis=1)

#Standard Deviations
train_set['x_std'] = train_set[x_columns].std(axis=1)
train_set['y_std'] = train_set[y_columns].std(axis=1)
train_set['z_std'] = train_set[z_columns].std(axis=1)

#Maximum
train_set['x_max'] = train_set[x_columns].max(axis=1)
train_set['y_max'] = train_set[y_columns].max(axis=1)
train_set['z_max'] = train_set[z_columns].max(axis=1)

#Minimum
train_set['x_min'] = train_set[x_columns].min(axis=1)

In [5]:
#Feature Extraction (Training - Large): Reduced set from the feature selection performed in previous run
#Count
train_set_large['count'] = train_set_large[all_columns].count(axis=1)

#Means
train_set_large['y_mean'] = train_set_large[y_columns].mean(axis=1)
train_set_large['z_mean'] = train_set_large[z_columns].mean(axis=1)

#Standard Deviations
train_set_large['x_std'] = train_set_large[x_columns].std(axis=1)
train_set_large['y_std'] = train_set_large[y_columns].std(axis=1)
train_set_large['z_std'] = train_set_large[z_columns].std(axis=1)

#Maximum
train_set_large['x_max'] = train_set_large[x_columns].max(axis=1)
train_set_large['y_max'] = train_set_large[y_columns].max(axis=1)
train_set_large['z_max'] = train_set_large[z_columns].max(axis=1)

#Minimum
train_set_large['x_min'] = train_set_large[x_columns].min(axis=1)

In [6]:
#Feature Extraction (Testing): Reduced set from the feature selection performed in previous run
#Count
test_set['count'] = test_set[all_columns].count(axis=1)

#Means
test_set['y_mean'] = test_set[y_columns].mean(axis=1)
test_set['z_mean'] = test_set[z_columns].mean(axis=1)

#Standard Deviations
test_set['x_std'] = test_set[x_columns].std(axis=1)
test_set['y_std'] = test_set[y_columns].std(axis=1)
test_set['z_std'] = test_set[z_columns].std(axis=1)

#Maximum
test_set['x_max'] = test_set[x_columns].max(axis=1)
test_set['y_max'] = test_set[y_columns].max(axis=1)
test_set['z_max'] = test_set[z_columns].max(axis=1)

#Minimum
test_set['x_min'] = test_set[x_columns].min(axis=1)

In [7]:
#Drop the original columns
train_set.drop(columns=all_columns, inplace=True)
train_set_large.drop(columns=all_columns, inplace=True)
test_set.drop(columns=all_columns, inplace=True)

In [8]:
#Convert to numpy (training features, training labels, and user ID)
train_x = train_set[final_features].to_numpy()
train_y = train_set['Class'].to_numpy()
train_user = train_set['User'].to_numpy()
train_x_large = train_set_large[final_features].to_numpy()
train_y_large = train_set_large['Class'].to_numpy()
train_user_large = train_set_large['User'].to_numpy()
test_x = test_set[final_features].to_numpy()
test_y = test_set['Class'].to_numpy()
test_user = test_set['User'].to_numpy()

# Model Evaluation & Selection

## Naive Bayes

In [9]:
#Cross-Validation

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 10

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Arrays to store cross-validation accuracy results 
cv_acc = np.zeros((trials*n))

for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        NB = GaussianNB()
        NB.fit(train_x_r, train_y_r)
        cv_acc[k+N*9] = accuracy_score(test_y_r, NB.predict(test_x_r))

In [10]:
#Baseline Model: Naive Bayes
NB = GaussianNB()
NB.fit(train_x,train_y)
print('Cross-Validation Accuracy: Mean:', np.mean(cv_acc,axis=0), ' STD: ',np.std(cv_acc,axis=0))
print('Training Accuracy: ', accuracy_score(train_y, NB.predict(train_x)))
print('Testing Accuracy: ', accuracy_score(test_y, NB.predict(test_x)))

Cross-Validation Accuracy: Mean: 0.787390761548065  STD:  0.15850156979235325
Training Accuracy:  0.912962962962963
Testing Accuracy:  0.8038295653822456


In [11]:
print('Confusion Matrix (Test):')
print(confusion_matrix(test_y, NB.predict(test_x)))
np.savetxt('NB.csv', confusion_matrix(test_y, NB.predict(test_x)), delimiter=',')

Confusion Matrix (Test):
[[4311   48  107    0    0]
 [  20 3675   19   28  660]
 [ 107    0 2622  145 1905]
 [   0  129    1 2935  849]
 [   0   27   77   17 3417]]


## SVM - RBF Kernel

In [12]:
#Cross-Validation

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 10

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Arrays to store cross-validation accuracy results 
cv_acc = np.zeros((trials*n))

for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        svm = SVC(kernel='rbf', gamma=0.0001438449888287663, C=4.832930238571752)
        svm.fit(train_x_r, train_y_r)
        cv_acc[k+N*9] = accuracy_score(test_y_r, svm.predict(test_x_r))

In [13]:
svm = SVC(kernel='rbf', gamma=0.0001438449888287663, C=4.832930238571752)
svm.fit(train_x,train_y)
print('Cross-Validation Accuracy: Mean:', np.mean(cv_acc,axis=0), ' STD: ',np.std(cv_acc,axis=0))
print('Training Accuracy: ', accuracy_score(train_y, svm.predict(train_x)))
print('Testing Accuracy: ', accuracy_score(test_y, svm.predict(test_x)))

Cross-Validation Accuracy: Mean: 0.9196754057428216  STD:  0.08613291667338556
Training Accuracy:  0.9991111111111111
Testing Accuracy:  0.8044931039385753


In [14]:
print('Confusion Matrix (Test):')
print(confusion_matrix(test_y, svm.predict(test_x)))
np.savetxt('svm.csv', confusion_matrix(test_y, svm.predict(test_x)), delimiter=',')

Confusion Matrix (Test):
[[4350   48    3    7   58]
 [  36 2979  538    0  849]
 [   0    0 4650   85   44]
 [   0    0  996 1801 1117]
 [   0   63    2  279 3194]]


## Perceptron

In [15]:
#Cross-Validation

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 10

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Arrays to store cross-validation accuracy results 
cv_acc = np.zeros((trials*n))

for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        perc = Perceptron(shuffle=True)
        perc.fit(train_x_r, train_y_r)
        cv_acc[k+N*9] = accuracy_score(test_y_r, perc.predict(test_x_r))

In [16]:
perc = Perceptron(shuffle=True)
perc.fit(train_x, train_y)
print('Cross-Validation Accuracy: Mean:', np.mean(cv_acc,axis=0), ' STD: ',np.std(cv_acc,axis=0))
print('Training Accuracy: ', accuracy_score(train_y, perc.predict(train_x)))
print('Testing Accuracy: ', accuracy_score(test_y, perc.predict(test_x)))

Cross-Validation Accuracy: Mean: 0.6847440699126092  STD:  0.12705674170182155
Training Accuracy:  0.7500740740740741
Testing Accuracy:  0.7514574150433669


In [17]:
print('Confusion Matrix (Test):')
print(confusion_matrix(test_y, perc.predict(test_x)))
np.savetxt('perc.csv', confusion_matrix(test_y, perc.predict(test_x)), delimiter=',')

Confusion Matrix (Test):
[[4231   48  163    0   24]
 [   2 4359   10    0   31]
 [  51  514 4136   21   57]
 [   1 2701  223  989    0]
 [ 163 1221   14    0 2140]]


In [18]:
#Testing the onevsone approach
ovo = OneVsOneClassifier(Perceptron())
ovo.fit(train_x, train_y)
print('Training Accuracy: ', accuracy_score(train_y, ovo.predict(train_x)))
print('Testing Accuracy: ', accuracy_score(test_y, ovo.predict(test_x)))

Training Accuracy:  0.9051851851851852
Testing Accuracy:  0.8138774349495237


## Linear Discriminant Analysis (LDA)

In [19]:
#Cross-Validation

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 10

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Arrays to store cross-validation accuracy results 
cv_acc = np.zeros((trials*n))

for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()
    
        lda = LinearDiscriminantAnalysis(solver='lsqr')
        lda.fit(train_x_r, train_y_r)
        cv_acc[k+N*9] = accuracy_score(test_y_r, lda.predict(test_x_r))

In [20]:
lda = LinearDiscriminantAnalysis(solver='lsqr') #All solvers produce the same result, shrinkage worsens results (samples >> features)
lda.fit(train_x, train_y)
print('Cross-Validation Accuracy: Mean:', np.mean(cv_acc,axis=0), ' STD: ',np.std(cv_acc,axis=0))
print('Training Accuracy: ', accuracy_score(train_y, lda.predict(train_x)))
print('Testing Accuracy: ', accuracy_score(test_y, lda.predict(test_x)))

Cross-Validation Accuracy: Mean: 0.9084893882646693  STD:  0.06565770843907735
Training Accuracy:  0.9328888888888889
Testing Accuracy:  0.9242618133560833


In [21]:
print('Confusion Matrix (Test):')
print(confusion_matrix(test_y, lda.predict(test_x)))
np.savetxt('lda.csv', confusion_matrix(test_y, lda.predict(test_x)), delimiter=',')

Confusion Matrix (Test):
[[4355   48   63    0    0]
 [  33 4247   35   85    2]
 [  48    0 3657 1074    0]
 [   0   59    0 3855    0]
 [   5   17    0  129 3387]]


## Quadratic Discriminant Analysis (QDA)

In [22]:
#Cross-Validation

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 10

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Arrays to store cross-validation accuracy results 
cv_acc = np.zeros((trials*n))

for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()

        qda = QuadraticDiscriminantAnalysis(reg_param=0.5008407989848213)
        qda.fit(train_x_r, train_y_r)
        cv_acc[k+N*9] = accuracy_score(test_y_r, qda.predict(test_x_r))

In [23]:
qda = QuadraticDiscriminantAnalysis(reg_param=0.5008407989848213)
qda.fit(train_x, train_y)
print('Cross-Validation Accuracy: Mean:', np.mean(cv_acc,axis=0), ' STD: ',np.std(cv_acc,axis=0))
print('Training Accuracy: ', accuracy_score(train_y, qda.predict(train_x)))
print('Testing Accuracy: ', accuracy_score(test_y, qda.predict(test_x)))

Cross-Validation Accuracy: Mean: 0.8993757802746567  STD:  0.0900826709277895
Training Accuracy:  0.9897037037037038
Testing Accuracy:  0.9147352955116356


In [24]:
print('Confusion Matrix (Test):')
print(confusion_matrix(test_y, qda.predict(test_x)))
np.savetxt('qda.csv', confusion_matrix(test_y, qda.predict(test_x)), delimiter=',')

Confusion Matrix (Test):
[[4212   48   98   74   34]
 [   0 4272  128    2    0]
 [ 152    0 4491  136    0]
 [   0  554  374 2951   35]
 [   7   29   46   82 3374]]


## KNN

In [25]:
#Cross-Validation

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 10

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Arrays to store cross-validation accuracy results 
cv_acc = np.zeros((trials*n))

for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()

        knn = KNeighborsClassifier(n_neighbors=1,n_jobs=-1)
        knn.fit(train_x_r, train_y_r)
        cv_acc[k+N*9] = accuracy_score(test_y_r, knn.predict(test_x_r))

In [26]:
knn = KNeighborsClassifier(n_neighbors=1,n_jobs=-1)
knn.fit(train_x, train_y)
print('Cross-Validation Accuracy: Mean:', np.mean(cv_acc,axis=0), ' STD: ',np.std(cv_acc,axis=0))
print('Training Accuracy: ', accuracy_score(train_y, knn.predict(train_x)))
print('Testing Accuracy: ', accuracy_score(test_y, knn.predict(test_x)))

Cross-Validation Accuracy: Mean: 0.7548064918851435  STD:  0.198558947188591
Training Accuracy:  1.0
Testing Accuracy:  0.6406464761363098


In [27]:
print('Confusion Matrix (Test):')
print(confusion_matrix(test_y, knn.predict(test_x)))
np.savetxt('knn.csv', confusion_matrix(test_y, knn.predict(test_x)), delimiter=',')

Confusion Matrix (Test):
[[3273   25    1    3 1164]
 [   5 3462   66    0  869]
 [   1  740 1864  259 1915]
 [   0   20  922 1586 1386]
 [   0   42   20  144 3332]]


## SVM - Linear Kernel

In [28]:
#Cross-Validation

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 10

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Arrays to store cross-validation accuracy results 
cv_acc = np.zeros((trials*n))

for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()

        svc = SVC(kernel='linear', C=0.00029470517025518097)
        svc.fit(train_x_r, train_y_r)
        cv_acc[k+N*9] = accuracy_score(test_y_r, svc.predict(test_x_r))

In [29]:
svc = SVC(kernel='linear', C=0.00029470517025518097)
svc.fit(train_x, train_y)
print('Cross-Validation Accuracy: Mean:', np.mean(cv_acc,axis=0), ' STD: ',np.std(cv_acc,axis=0))
print('Training Accuracy: ', accuracy_score(train_y, svc.predict(train_x)))
print('Testing Accuracy: ', accuracy_score(test_y, svc.predict(test_x)))

Cross-Validation Accuracy: Mean: 0.8818726591760299  STD:  0.1023991133762062
Training Accuracy:  0.9864444444444445
Testing Accuracy:  0.9224607801317598


In [30]:
print('Confusion Matrix (Test):')
print(confusion_matrix(test_y, svc.predict(test_x)))
np.savetxt('svc.csv', confusion_matrix(test_y, svc.predict(test_x)), delimiter=',')

Confusion Matrix (Test):
[[4341   48   64    0   13]
 [  36 4318   31   15    2]
 [  19   45 4427  288    0]
 [   0  907   63 2936    8]
 [   0   61    0   36 3441]]


## Voting Classifier - Four Classifiers Combined

In [31]:
#Cross-Validation

#List of available training users to drop one by one in cross validation
users = [0,1,2,5,6,8,9,10,11]
n = len(users)
trials = 10

user_df = []
for i in range(n):
    user_df.append(train_set.loc[train_set['User'] == users[i]])

#Arrays to store cross-validation accuracy results 
cv_acc = np.zeros((trials*n))

for N in range(trials):
    for k in range(n):
        #Create a shuffled, balanced set from the data
        for j in range(n):
            class1 = user_df[j].loc[user_df[j]['Class'] == 1].sample(n=178)
            class2 = user_df[j].loc[user_df[j]['Class'] == 2].sample(n=178)
            class3 = user_df[j].loc[user_df[j]['Class'] == 3].sample(n=178)
            class4 = user_df[j].loc[user_df[j]['Class'] == 4].sample(n=178)
            class5 = user_df[j].loc[user_df[j]['Class'] == 5].sample(n=178)
            user_df[j] = pd.concat([class1, class2, class3, class4, class5])
    
        balanced_set = pd.concat([user_df[0],user_df[1],user_df[2],user_df[3],user_df[4],\
                                  user_df[5],user_df[6],user_df[7],user_df[8]])

        #Remove one user from the training set and use it for validation, train using the balanced remainder
        train_x_r = balanced_set.loc[balanced_set['User'] != users[k]][final_features].to_numpy()
        train_y_r = balanced_set.loc[balanced_set['User'] != users[k]]['Class'].to_numpy()
        test_x_r = balanced_set.loc[balanced_set['User'] == users[k]][final_features].to_numpy()
        test_y_r = balanced_set.loc[balanced_set['User'] == users[k]]['Class'].to_numpy()

        ensemble = VotingClassifier(estimators=[('svm', svm), ('lda', lda), ('qda', qda)],voting='hard')
        ensemble.fit(train_x_r, train_y_r)
        cv_acc[k+N*9] = accuracy_score(test_y_r, ensemble.predict(test_x_r))

In [32]:
ensemble = VotingClassifier(estimators=[('svm', svm), ('lda', lda), ('qda', qda)],voting='hard')
ensemble.fit(train_x, train_y)
print('Cross-Validation Accuracy: Mean:', np.mean(cv_acc,axis=0), ' STD: ',np.std(cv_acc,axis=0))
print('Training Accuracy: ', accuracy_score(train_y, ensemble.predict(train_x)))
print('Testing Accuracy: ', accuracy_score(test_y, ensemble.predict(test_x)))

Cross-Validation Accuracy: Mean: 0.9158551810237204  STD:  0.07909176762788242
Training Accuracy:  0.9951851851851852
Testing Accuracy:  0.9539788615574197


In [33]:
print('Confusion Matrix (Test):')
print(confusion_matrix(test_y, ensemble.predict(test_x)))
np.savetxt('ensemble.csv', confusion_matrix(test_y, ensemble.predict(test_x)), delimiter=',')

Confusion Matrix (Test):
[[4350   48   49    0   19]
 [  33 4282   84    1    2]
 [  49    0 4651   79    0]
 [   0  163  304 3423   24]
 [   7   59    9   41 3422]]
