In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn import tree
import sklearn.ensemble as e
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import joblib

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
y_train = df_train['label']
X_train = df_train.drop(['label'], axis=1)

subset_x_train, subset_x_test, subset_y_train, subset_y_test = train_test_split(X_train, y_train, test_size=0.2)

In [3]:
#Logistic Reg
with joblib.parallel_backend(backend='loky', n_jobs=14):
    model1 = LogisticRegression(max_iter=500)
    model1.fit(subset_x_train, subset_y_train)
    print('logistic performance: ', model1.score(subset_x_test, subset_y_test))
    print()
    
#around 89% accuracy

In [4]:
#knn - euclidean
with joblib.parallel_backend(backend='loky', n_jobs=14):
    model2_accuracy = []
    model2 = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
    model2.fit(subset_x_train, subset_y_train)
    model2_accuracy.append(model2.score(subset_x_test, subset_y_test))


#.964285 accuracy

In [5]:
print("knn euclid mean Accuracies:", max(model2_accuracy))
print(model2_accuracy)

knn euclid mean Accuracies: 0.9642857142857143
[0.9642857142857143]


In [6]:
model3_accuracy = []
with joblib.parallel_backend(backend='loky', n_jobs=14):
    for i in range(1,11):
        model3 = KNeighborsClassifier(n_neighbors=i, metric='manhattan')
        model3.fit(subset_x_train, subset_y_train)
        model3_accuracy.append(model3.score(subset_x_test, subset_y_test))

# #accuracy = .9578571428571429

In [7]:
print("knn manhattan mean Accuracies:", max(model3_accuracy))
print(model3_accuracy)

knn manhattan mean Accuracies: 0.9578571428571429
[0.9578571428571429, 0.9469047619047619, 0.9561904761904761, 0.955, 0.9552380952380952, 0.9553571428571429, 0.9542857142857143, 0.9521428571428572, 0.9519047619047619, 0.9519047619047619]


In [10]:
model5_accuracy = []
with joblib.parallel_backend(backend='loky', n_jobs=14):
    for i in range(1,9):
        model5 = KNeighborsClassifier(n_neighbors=i, metric='chebyshev')
        model5.fit(subset_x_train, subset_y_train)
        model5_accuracy.append(model3.score(subset_x_test, subset_y_test))

#accuracy: 0.9519047619047619

In [11]:
print("knn chebyshev mean Accuracies:", max(model5_accuracy))
print(model5_accuracy)

knn chebyshev mean Accuracies: 0.9519047619047619
[0.9519047619047619, 0.9519047619047619, 0.9519047619047619, 0.9519047619047619, 0.9519047619047619, 0.9519047619047619, 0.9519047619047619, 0.9519047619047619, 0.9519047619047619, 0.9519047619047619]


In [13]:
#NB - did not work
model1 = CategoricalNB()
model1.fit(subset_x_train, subset_y_train)
print('Naive bayes performance: ', model1.score(subset_x_test, subset_y_test))

In [30]:
#tree
model1_accuracy = []
with joblib.parallel_backend(backend='loky', n_jobs=14):
    for depth in range(25,50):
        d_tree = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = depth)
        m = d_tree.fit(subset_x_train, subset_y_train)
        model1_accuracy.append(m.score(subset_x_test, subset_y_test))
    

In [31]:
print("tree Accuracies:", max(model1_accuracy))
print(model1_accuracy)

tree Accuracies: 0.8676190476190476
[0.863452380952381, 0.8676190476190476, 0.8628571428571429, 0.8598809523809524, 0.8633333333333333, 0.8636904761904762, 0.8629761904761905, 0.8629761904761905, 0.8652380952380953, 0.8622619047619048, 0.8633333333333333, 0.8626190476190476, 0.8635714285714285, 0.8582142857142857, 0.8647619047619047, 0.8623809523809524, 0.8617857142857143, 0.863452380952381, 0.8605952380952381, 0.865, 0.8613095238095239, 0.8630952380952381, 0.8663095238095239, 0.8588095238095238, 0.8614285714285714]


In [24]:
#forest
model1_accuracy = []
with joblib.parallel_backend(backend='loky', n_jobs=14):
    for trees in range(30,71, 5):
        d_tree = e.RandomForestClassifier(criterion = 'entropy', n_estimators = trees)
        m = d_tree.fit(subset_x_train, subset_y_train)
        model1_accuracy.append(m.score(subset_x_test, subset_y_test))

#accuracy 0.9628571428571429

In [25]:
print("forest Accuracies:", max(model1_accuracy))
print(model1_accuracy)

forest Accuracies: 0.9628571428571429
[0.9561904761904761, 0.9583333333333334, 0.9565476190476191, 0.9592857142857143, 0.9595238095238096, 0.9583333333333334, 0.9596428571428571, 0.9628571428571429, 0.9602380952380952]


In [3]:
#svm

kernels = ['linear', 'poly', 'rbf', 'sigmoid']
test_accuracy = []

with joblib.parallel_backend(backend='loky', n_jobs=14):
    for kernel in kernels:
        model = svm.SVC(kernel=kernel, max_iter=8000)
        model.fit(subset_x_train, subset_y_train)
        accuracy = model.score(subset_x_test, subset_y_test)
        test_accuracy.append((accuracy, kernel))

#SVM accuracy: .9726190476190476 -RBF



In [4]:
print("svm Accuracies:", max(test_accuracy))
print(test_accuracy)

svm Accuracies: (0.9726190476190476, 'rbf')
[(0.8845238095238095, 'linear'), (0.9709523809523809, 'poly'), (0.9726190476190476, 'rbf'), (0.7785714285714286, 'sigmoid')]


In [27]:
#NN
test_acc = []
with joblib.parallel_backend(backend='loky', n_jobs=14):
    model = MLPClassifier(max_iter=500)
    model.fit(subset_x_train, subset_y_train)
    test_acc.append(model.score(subset_x_test, subset_y_test))

#unarchitected NN .96 accuracy

In [28]:
print("NN Accuracies:", max(test_acc))
print(test_acc)

NN Accuracies: 0.96
[0.96]


In [None]:
test_accuracy = []
with joblib.parallel_backend(backend='loky', n_jobs=14):
    for i in range(2,7):
        for j in range(2,7):
            for k in range(2,7):
                model = MLPClassifier(hidden_layer_sizes=(i,j,k), max_iter=2000)
                model.fit(subset_x_train, subset_y_train)
                test = ((model.score(subset_x_test, subset_y_test)), (i,j,k))
                test_accuracy.append(test)

In [None]:
print(max(test_accuracy))