## ML Final Group Project - kNN & SVM

In [22]:
import json, random, sys
import numpy as np
import getopt
import pandas as pd
import math
import time
from sklearn import metrics
import sklearn
import plotly
from plotly.subplots import make_subplots
import itertools
import plotly.graph_objects as go
import math
from sklearn.decomposition import PCA
from statistics import mean
from sklearn import svm
from random import shuffle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import load_ship_data
import knn

np.random.seed(522)
Training_data, Testing_data, Validation_data = load_ship_data.load_data_train_test_split(fileName = "./data/shipsnet.json")

Xtrain = Training_data[0]
ytrain = Training_data[1]
Xtest = Testing_data[0]
ytest = Testing_data[1]
Xval = Validation_data[0]
yval = Validation_data[1]
Xtrain = Xtrain.reshape(Xtrain.shape[0],Xtrain.shape[1]*Xtrain.shape[2]*Xtrain.shape[3])
Xval = Xval.reshape(Xval.shape[0],Xval.shape[1]*Xval.shape[2]*Xval.shape[3])
Xtest = Xtest.reshape(Xtest.shape[0],Xtest.shape[1]*Xtest.shape[2]*Xtest.shape[3])
Training_data = pd.DataFrame(Xtrain)
Training_data["Y"] = ytrain
Validation_data = pd.DataFrame(Xval)
Validation_data["Y"] = yval
Testing_data = pd.DataFrame(Xtest)
Testing_data["Y"] = ytest
Training_data = pd.concat([Training_data,Validation_data])
ytrain = np.concatenate([ytrain,yval])

Training Set Data Length:  2800   Label Length:  2800
TestingSet Set Data Length:  600  Label Length:  600
Validation Set Data Length:  600  Label Length:  600


In [6]:
# kNN (without PCA): cross validation
k = [5,10,25,50,75,100]
KNN_models = []
for i in range(len(k)):
    print("Running KNN with k = " + str(k[i]))
    fold = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    for train_index, test_index in skf.split(Training_data.drop(['Y'],axis = 1),Training_data['Y']):
        Training_data_ = Training_data.iloc[train_index, :]
        Testing_data_ = Training_data.iloc[test_index, :]
        Results = knn.KNN(
            Training_data = Training_data_,
            Testing_data = Testing_data_,
            class_var = "Y",
            num_neighb = int(k[i])
        )
        Metrics = sklearn.metrics.classification_report(Testing_data_['Y'],Results[1],output_dict = True)
        Metrics = [Results[0],Metrics['accuracy'],Metrics['0']['precision'],Metrics['0']['recall'],Metrics['1']['precision'],Metrics['1']['recall']]
        fold.append(Metrics)
    KNN_models.append(fold)
    
plot_data = pd.DataFrame()
for d in range(len(k)):
    Data_results = pd.DataFrame(KNN_models[d])
    Data_results["K"] = k[d]
    Data_results["Fold"] = list(range(5))
    plot_data = pd.concat([plot_data,Data_results])
    
plot_data = plot_data.groupby(['K']).mean()
K = ["k: 5","k: 10","k: 25","k: 50","k: 75","k: 100","k: 500","k: 1000","k: 2500"]

fig = make_subplots(rows = 1,cols = 2,column_widths = [0.1, 0.9])
fig.add_trace(go.Bar(x = K,y = plot_data[0],showlegend = False),
    row = 1,col = 1)
fig.add_trace(go.Scatter(x = k,y = plot_data[1],showlegend = True,name = "overall accuracy"),
    row = 1,col = 2)
fig.add_trace(go.Scatter(x = k,y = plot_data[2],showlegend = True,name = "class zero accuracy"),
    row = 1,col = 2)
fig.add_trace(go.Scatter(x = k,y = plot_data[4],showlegend = True,name = "class one accuracy"),
    row = 1,col = 2)
fig.update_xaxes(title_text = 'K',row = 1,col = 2)
fig.update_yaxes(title_text = 'Accuracy',row = 1,col = 2)
fig.update_xaxes(title_text = '',row = 1,col = 1,titlefont = dict(size = 12),tickfont=dict(size=11))
fig.update_yaxes(title_text = 'Time (seconds)',row = 1,col = 1,tickfont=dict(size=10),titlefont = dict(size = 12))
fig.update_layout(legend = dict(orientation="h",yanchor="bottom",y=1.02,xanchor="right",x=1))
fig.show()

Running KNN with k = 5
Running KNN with k = 10
Running KNN with k = 25
Running KNN with k = 50
Running KNN with k = 75
Running KNN with k = 100


In [7]:
# kNN (without PCA): final model
Final_model = knn.KNN(
    Training_data = Training_data,
    Testing_data = Testing_data,
    class_var = "Y",
    num_neighb = 5
)
print(sklearn.metrics.classification_report(Testing_data["Y"],Final_model[1]))

              precision    recall  f1-score   support

           0       0.82      0.96      0.89       450
           1       0.75      0.39      0.51       150

    accuracy                           0.81       600
   macro avg       0.79      0.67      0.70       600
weighted avg       0.81      0.81      0.79       600



In [9]:
# kNN (with PCA): cross validation
pca = PCA(n_components = 0.99,svd_solver = 'full')
Training_data_PCA = pd.DataFrame(pca.fit_transform(Training_data.drop(['Y'],1)))
Testing_data_PCA = pd.DataFrame(pca.transform(Testing_data.drop(['Y'],1)))
Training_data_PCA["Y"] = ytrain
Testing_data_PCA["Y"] = ytest

k = [5,10,25,50,75,100,200,500]
KNN_models_PCA = []
for i in range(len(k)):
    print("Running KNN with k = " + str(k[i]))
    fold = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
    for train_index, test_index in skf.split(Training_data_PCA.drop(['Y'],axis = 1),Training_data_PCA['Y']):
        Training_data_PCA_ = Training_data_PCA.iloc[train_index, :]
        Testing_data_PCA_ = Training_data_PCA.iloc[test_index, :]
        Results = knn.KNN(
            Training_data = Training_data_PCA_,
            Testing_data = Testing_data_PCA_,
            class_var = "Y",
            num_neighb = int(k[i])
        )
        Metrics = sklearn.metrics.classification_report(Testing_data_PCA_['Y'],Results[1],output_dict = True)
        Metrics = [Results[0],Metrics['accuracy'],Metrics['0']['precision'],Metrics['0']['recall'],Metrics['1']['precision'],Metrics['1']['recall']]
        fold.append(Metrics)
    KNN_models_PCA.append(fold)
    
plot_data = pd.DataFrame()
for d in range(8):
    Data_results = pd.DataFrame(KNN_models_PCA[d])
    Data_results["K"] = k[d]
    Data_results["Fold"] = list(range(5))
    plot_data = pd.concat([plot_data,Data_results])
    
plot_data = plot_data.groupby(['K']).mean()
K = ["k: 5","k: 10","k: 25","k: 50","k: 75","k: 100","k: 200","k: 500"]

fig = make_subplots(rows = 1,cols = 2,column_widths = [0.1, 0.9])
fig.add_trace(go.Bar(x = K,y = plot_data[0],showlegend = False),
    row = 1,col = 1)
fig.add_trace(go.Scatter(x = k,y = plot_data[1],showlegend = True,name = "overall accuracy"),
    row = 1,col = 2)
fig.add_trace(go.Scatter(x = k,y = plot_data[2],showlegend = True,name = "class zero accuracy"),
    row = 1,col = 2)
fig.add_trace(go.Scatter(x = k,y = plot_data[4],showlegend = True,name = "class one accuracy"),
    row = 1,col = 2)
fig.update_xaxes(title_text = 'K',row = 1,col = 2)
fig.update_yaxes(title_text = 'Accuracy',row = 1,col = 2)
fig.update_xaxes(title_text = '',row = 1,col = 1,titlefont = dict(size = 12),tickfont=dict(size=11))
fig.update_yaxes(title_text = 'Time (seconds)',row = 1,col = 1,tickfont=dict(size=10),titlefont = dict(size = 12))
fig.update_layout(legend = dict(orientation="h",yanchor="bottom",y=1.02,xanchor="right",x=1))
fig.show()

Running KNN with k = 5
Running KNN with k = 10
Running KNN with k = 25
Running KNN with k = 50
Running KNN with k = 75
Running KNN with k = 100
Running KNN with k = 200
Running KNN with k = 500


In [10]:
# kNN (without PCA): final model
Final_model = knn.KNN(
    Training_data = Training_data_PCA,
    Testing_data = Testing_data_PCA,
    class_var = "Y",
    num_neighb = 10
)
print(sklearn.metrics.classification_report(Testing_data_PCA["Y"],Final_model[1]))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94       450
           1       0.83      0.84      0.83       150

    accuracy                           0.92       600
   macro avg       0.89      0.89      0.89       600
weighted avg       0.92      0.92      0.92       600



In [11]:
# SVM (without PCA): cross validation linear, rbf, and polynomial kernel
C = [0.001,0.01,0.1,1,10,100,1000]
SVM_models = []
for i in range(len(C)):
    print("Running linear SVM with C = " + str(C[i]))
    fold = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
    for train_index, test_index in skf.split(Training_data.drop(['Y'],axis = 1),Training_data['Y']):
        Training_data_ = Training_data.iloc[train_index, :]
        Testing_data_ = Training_data.iloc[test_index, :]
        start = time.time()
        clf = svm.SVC(kernel = "linear",C = C[i])
        clf.fit(Training_data_.drop('Y',1).values,Training_data_['Y'])
        Time = time.time() - start
        predictions = clf.predict(Testing_data_.drop('Y',1).values)
        Metrics = sklearn.metrics.classification_report(Testing_data_['Y'],predictions,output_dict = True)
        Metrics = [Time,Metrics['accuracy'],Metrics['0']['precision'],Metrics['0']['recall'],Metrics['1']['precision'],Metrics['1']['recall']]
        fold.append(Metrics)
    SVM_models.append(fold)
    
Data_results_all = pd.DataFrame()
for d in range(len(C)):
    Data_results = pd.DataFrame(SVM_models[d])
    Data_results["C"] = C[d]
    Data_results["Fold"] = list(range(5))
    Data_results_all = pd.concat([Data_results_all,Data_results])
    
Results = Data_results_all.groupby('C').mean()
Results.columns = ["Runtime","Accuracy","Precision0","Recall0","Precision1","Recall1","Fold"]
print(Results)

C = [0.001,0.01,0.1,1,10,100,1000]
SVM_models_rbf = []
for i in range(len(C)):
    print("Running rbf SVM with C = " + str(C[i]))
    fold = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
    for train_index, test_index in skf.split(Training_data.drop(['Y'],axis = 1),Training_data['Y']):
        Training_data_ = Training_data.iloc[train_index, :]
        Testing_data_ = Training_data.iloc[test_index, :]
        start = time.time()
        clf = svm.SVC(kernel = "rbf",C = C[i])
        clf.fit(Training_data_.drop('Y',1).values,Training_data_['Y'])
        Time = time.time() - start
        predictions = clf.predict(Testing_data_.drop('Y',1).values)
        Metrics = sklearn.metrics.classification_report(Testing_data_['Y'],predictions,output_dict = True)
        Metrics = [Time,Metrics['accuracy'],Metrics['0']['precision'],Metrics['0']['recall'],Metrics['1']['precision'],Metrics['1']['recall']]
        fold.append(Metrics)
    SVM_models_rbf.append(fold)
    
Data_results_all = pd.DataFrame()
for d in range(len(C)):
    Data_results = pd.DataFrame(SVM_models_rbf[d])
    Data_results["C"] = C[d]
    Data_results["Fold"] = list(range(5))
    Data_results_all = pd.concat([Data_results_all,Data_results])
    
Results = Data_results_all.groupby('C').mean()
Results.columns = ["Runtime","Accuracy","Precision0","Recall0","Precision1","Recall1","Fold"]
print(Results)

C = [0.001,0.01,0.1,1,10,100,1000]
SVM_models_poly = []
for i in range(len(C)):
    print("Running poly SVM with C = " + str(C[i]))
    fold = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
    for train_index, test_index in skf.split(Training_data.drop(['Y'],axis = 1),Training_data['Y']):
        Training_data_ = Training_data.iloc[train_index, :]
        Testing_data_ = Training_data.iloc[test_index, :]
        start = time.time()
        clf = svm.SVC(kernel = "poly",C = C[i])
        clf.fit(Training_data_.drop('Y',1).values,Training_data_['Y'])
        Time = time.time() - start
        predictions = clf.predict(Testing_data_.drop('Y',1).values)
        Metrics = sklearn.metrics.classification_report(Testing_data_['Y'],predictions,output_dict = True)
        Metrics = [Time,Metrics['accuracy'],Metrics['0']['precision'],Metrics['0']['recall'],Metrics['1']['precision'],Metrics['1']['recall']]
        fold.append(Metrics)
    SVM_models_poly.append(fold)
    
Data_results_all = pd.DataFrame()
for d in range(len(C)):
    Data_results = pd.DataFrame(SVM_models_poly[d])
    Data_results["C"] = C[d]
    Data_results["Fold"] = list(range(5))
    Data_results_all = pd.concat([Data_results_all,Data_results])
    
Results = Data_results_all.groupby('C').mean()
Results.columns = ["Runtime","Accuracy","Precision0","Recall0","Precision1","Recall1","Fold"]
print(Results)

# SVM (without PCA): final model with rbf kernel
clf = svm.SVC(kernel = "rbf",C = 10)
clf.fit(Training_data.drop('Y',1).values,Training_data['Y'])
predictions = clf.predict(Testing_data.drop('Y',1).values)
print(sklearn.metrics.classification_report(Testing_data['Y'],predictions))

Running linear SVM with C = 0.001
Running linear SVM with C = 0.01
Running linear SVM with C = 0.1
Running linear SVM with C = 1
Running linear SVM with C = 10
Running linear SVM with C = 100
Running linear SVM with C = 1000
            Runtime  Accuracy  Precision0   Recall0  Precision1   Recall1  \
C                                                                           
0.001     49.392094  0.909412    0.956596  0.921176    0.787698  0.874118   
0.010     47.075429  0.909412    0.956596  0.921176    0.787698  0.874118   
0.100     47.255470  0.909412    0.956596  0.921176    0.787698  0.874118   
1.000     47.066771  0.909412    0.956596  0.921176    0.787698  0.874118   
10.000    47.080490  0.909412    0.956596  0.921176    0.787698  0.874118   
100.000   47.180123  0.909412    0.956596  0.921176    0.787698  0.874118   
1000.000  47.011880  0.909412    0.956596  0.921176    0.787698  0.874118   

          Fold  
C               
0.001        2  
0.010        2  
0.100        


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Running linear SVM with C = 0.01



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Running linear SVM with C = 0.1
Running linear SVM with C = 1
Running linear SVM with C = 10
Running linear SVM with C = 100
Running linear SVM with C = 1000
            Runtime  Accuracy  Precision0   Recall0  Precision1   Recall1  \
C                                                                           
0.001     81.007202  0.750000    0.750000  1.000000    0.000000  0.000000   
0.010     80.881238  0.750000    0.750000  1.000000    0.000000  0.000000   
0.100     67.546969  0.895588    0.886236  0.987843    0.945268  0.618824   
1.000     46.095645  0.963824    0.968748  0.983529    0.948678  0.904706   
10.000    38.884915  0.976765    0.983615  0.985490    0.956461  0.950588   
100.000   37.948177  0.976471    0.983609  0.985098    0.955243  0.950588   
1000.000  39.311203  0.976471    0.983609  0.985098    0.955243  0.950588   

          Fold  
C               
0.001        2  
0.010        2  
0.100        2  
1.000        2  
10.000       2  
100.000      2  
1000.000    

In [12]:
# SVM (with PCA): cross validation linear, rbf, and polynomial kernel
C = [0.001,0.01,0.1,1,10,100,1000]
SVM_PCA_models = []
for i in range(len(C)):
    print("Running linear SVM with C = " + str(C[i]))
    fold = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
    for train_index, test_index in skf.split(Training_data_PCA.drop(['Y'],axis = 1),Training_data_PCA['Y']):
        Training_data_PCA_ = Training_data_PCA.iloc[train_index, :]
        Testing_data_PCA_ = Training_data_PCA.iloc[test_index, :]
        start = time.time()
        clf = svm.SVC(kernel = "linear",C = C[i])
        clf.fit(Training_data_PCA_.drop('Y',1).values,Training_data_PCA_['Y'])
        Time = time.time() - start
        predictions = clf.predict(Testing_data_PCA_.drop('Y',1).values)
        Metrics = sklearn.metrics.classification_report(Testing_data_PCA_['Y'],predictions,output_dict = True)
        Metrics = [Time,Metrics['accuracy'],Metrics['0']['precision'],Metrics['0']['recall'],Metrics['1']['precision'],Metrics['1']['recall']]
        fold.append(Metrics)
    SVM_PCA_models.append(fold)
    
Data_results_all = pd.DataFrame()
for d in range(7):
    Data_results = pd.DataFrame(SVM_PCA_models[d])
    Data_results["C"] = C[d]
    Data_results["Fold"] = list(range(5))
    Data_results_all = pd.concat([Data_results_all,Data_results])

Results = Data_results_all.groupby('C').mean()
Results.columns = ["Runtime","Accuracy","Precision0","Recall0","Precision1","Recall1","Fold"]
print(Results)

C = [0.001,0.01,0.1,1,10,100,1000]
SVM_PCA_rbf_models = []
for i in range(len(C)):
    print("Running linear SVM with C = " + str(C[i]))
    fold = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
    for train_index, test_index in skf.split(Training_data_PCA.drop(['Y'],axis = 1),Training_data_PCA['Y']):
        Training_data_PCA_ = Training_data_PCA.iloc[train_index, :]
        Testing_data_PCA_ = Training_data_PCA.iloc[test_index, :]
        start = time.time()
        clf = svm.SVC(kernel = "rbf",C = C[i])
        clf.fit(Training_data_PCA_.drop('Y',1).values,Training_data_PCA_['Y'])
        Time = time.time() - start
        predictions = clf.predict(Testing_data_PCA_.drop('Y',1).values)
        Metrics = sklearn.metrics.classification_report(Testing_data_PCA_['Y'],predictions,output_dict = True)
        Metrics = [Time,Metrics['accuracy'],Metrics['0']['precision'],Metrics['0']['recall'],Metrics['1']['precision'],Metrics['1']['recall']]
        fold.append(Metrics)
    SVM_PCA_rbf_models.append(fold)
    
Data_results_all = pd.DataFrame()
for d in range(7):
    Data_results = pd.DataFrame(SVM_PCA_rbf_models[d])
    Data_results["C"] = C[d]
    Data_results["Fold"] = list(range(5))
    Data_results_all = pd.concat([Data_results_all,Data_results])

Results = Data_results_all.groupby('C').mean()
Results.columns = ["Runtime","Accuracy","Precision0","Recall0","Precision1","Recall1","Fold"]
print(Results)

C = [0.001,0.01,0.1,1,10,100,1000]
SVM_PCA_poly_models = []
for i in range(len(C)):
    print("Running linear SVM with C = " + str(C[i]))
    fold = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
    for train_index, test_index in skf.split(Training_data_PCA.drop(['Y'],axis = 1),Training_data_PCA['Y']):
        Training_data_PCA_ = Training_data_PCA.iloc[train_index, :]
        Testing_data_PCA_ = Training_data_PCA.iloc[test_index, :]
        start = time.time()
        clf = svm.SVC(kernel = "poly",C = C[i])
        clf.fit(Training_data_PCA_.drop('Y',1).values,Training_data_PCA_['Y'])
        Time = time.time() - start
        predictions = clf.predict(Testing_data_PCA_.drop('Y',1).values)
        Metrics = sklearn.metrics.classification_report(Testing_data_PCA_['Y'],predictions,output_dict = True)
        Metrics = [Time,Metrics['accuracy'],Metrics['0']['precision'],Metrics['0']['recall'],Metrics['1']['precision'],Metrics['1']['recall']]
        fold.append(Metrics)
    SVM_PCA_poly_models.append(fold)
    
Data_results_all = pd.DataFrame()
for d in range(7):
    Data_results = pd.DataFrame(SVM_PCA_poly_models[d])
    Data_results["C"] = C[d]
    Data_results["Fold"] = list(range(5))
    Data_results_all = pd.concat([Data_results_all,Data_results])

Results = Data_results_all.groupby('C').mean()
Results.columns = ["Runtime","Accuracy","Precision0","Recall0","Precision1","Recall1","Fold"]
print(Results)

# SVM (with PCA): final model with rbf kernel
clf = svm.SVC(kernel = "rbf",C = 10)
clf.fit(Training_data_PCA.drop('Y',1).values,Training_data_PCA['Y'])
predictions = clf.predict(Testing_data_PCA.drop('Y',1).values)
print(sklearn.metrics.classification_report(Testing_data_PCA['Y'],predictions))

Running linear SVM with C = 0.001
Running linear SVM with C = 0.01
Running linear SVM with C = 0.1
Running linear SVM with C = 1
Running linear SVM with C = 10
Running linear SVM with C = 100
Running linear SVM with C = 1000
           Runtime  Accuracy  Precision0  Recall0  Precision1   Recall1  Fold
C                                                                            
0.001     2.650558     0.885    0.949341  0.89451    0.730828  0.856471     2
0.010     2.592325     0.885    0.949341  0.89451    0.730828  0.856471     2
0.100     2.598834     0.885    0.949341  0.89451    0.730828  0.856471     2
1.000     2.592151     0.885    0.949341  0.89451    0.730828  0.856471     2
10.000    2.596436     0.885    0.949341  0.89451    0.730828  0.856471     2
100.000   2.588866     0.885    0.949341  0.89451    0.730828  0.856471     2
1000.000  2.601414     0.885    0.949341  0.89451    0.730828  0.856471     2
Running linear SVM with C = 0.001



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Running linear SVM with C = 0.01



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Running linear SVM with C = 0.1
Running linear SVM with C = 1
Running linear SVM with C = 10
Running linear SVM with C = 100
Running linear SVM with C = 1000
           Runtime  Accuracy  Precision0   Recall0  Precision1   Recall1  Fold
C                                                                             
0.001     3.875667  0.750000    0.750000  1.000000    0.000000  0.000000     2
0.010     3.934773  0.750000    0.750000  1.000000    0.000000  0.000000     2
0.100     3.051226  0.897353    0.887838  0.988235    0.947453  0.624706     2
1.000     2.084652  0.965882    0.970989  0.983922    0.950127  0.911765     2
10.000    1.662820  0.976471    0.983604  0.985098    0.955443  0.950588     2
100.000   1.630877  0.974118    0.980921  0.984706    0.953778  0.942353     2
1000.000  1.625736  0.974118    0.980921  0.984706    0.953778  0.942353     2
Running linear SVM with C = 0.001



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Running linear SVM with C = 0.01
Running linear SVM with C = 0.1
Running linear SVM with C = 1
Running linear SVM with C = 10
Running linear SVM with C = 100
Running linear SVM with C = 1000
           Runtime  Accuracy  Precision0   Recall0  Precision1   Recall1  Fold
C                                                                             
0.001     3.780846  0.750000    0.750000  1.000000    0.000000  0.000000     2
0.010     3.804873  0.768824    0.764398  1.000000    1.000000  0.075294     2
0.100     3.655758  0.813529    0.803972  0.993725    0.935419  0.272941     2
1.000     3.484926  0.873529    0.867110  0.981961    0.910727  0.548235     2
10.000    3.365449  0.911471    0.922611  0.962745    0.871630  0.757647     2
100.000   3.239816  0.928235    0.958328  0.945490    0.843382  0.876471     2
1000.000  3.124904  0.917941    0.962259  0.927059    0.803402  0.890588     2
              precision    recall  f1-score   support

           0       0.98      0.98      0.98

K-fold cross validation was used to train the k-nearest neighbors algorithm. The number of neighbors, k, was varied from five to one hundred. The optimal choice for the number of neighbors was ten in which the class one accuracy was maximized at 0.72 on the cross validation set and the overall accuracy was 0.79. Applying this model on the testing dataset gave an overall accuracy of 0.79, however, the class one accuracy decreased to 0.68. Principal component analysis was utilized before re-training the k-nearest neighbor algorithm to try and reduce noise within the dataset whil preserving 99% of the original variability. This resulted in selecting 833 prinicipal components. The optimal number of neighbors was again equal to ten. After reducing the dimensionality of the dataset, the cross validation overall accuracy increased to 0.93 and the class one accuracy increased to 0.85. When applying the model to the testing dataset, the overall accuracies and class one accuracies remained at 0.93 and 0.85, respectively. 

The support vector machine (SVM) with a linear, radial basis function, and third degree polynomial kernel were trained both without and with PCA. The best classifier, without dimensionality reduction, was the SVM with a radial basis function kernel. The regularization parameter, C, was tuned using cross validation which dictates the width of the margin. For the radial basis function kernel the optimal value of C was ten which resulted in an overall accuracy of 0.976 on the cross validation dataset and 0.98 for the overall accuracy on the test dataset. When applied to the testing datset this model retained 0.98 overall accuracy. The best classifier, with dimensionality reduction, was the SVM again with a radial basis function kernel. The optimal value for C was one hundred which obtained 0.97 on the overall accuracy for the cross validation set and 0.98 for the overall accuracy on the test dataset. 