## Importing libraries/data

### Importing Libraries

In [71]:
# For general use
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# For our classifiers
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import tree
from sklearn import neural_network

# For confusion matrix/data importing
from sklearn import metrics  
from sklearn.datasets import load_digits  

### Importing CSV data

In [81]:
dfs = []
for cls in ['+', ')', '(', ',', '-','0','1','2','3','4','5','6','7','8','9']:
    print(f"Loading {cls}.CSV...")
    a = pd.read_csv("CSV/"+cls+".csv", index_col=0)
    a['class'] = cls
    dfs.append(a)

print("Concatenating Data...")
df = pd.concat(dfs,ignore_index=True, sort=False)
print("Done")

Loading +.CSV...
Loading ).CSV...
Loading (.CSV...
Loading ,.CSV...
Loading -.CSV...
Loading 0.CSV...
Loading 1.CSV...
Loading 2.CSV...
Loading 3.CSV...
Loading 4.CSV...
Loading 5.CSV...
Loading 6.CSV...
Loading 7.CSV...
Loading 8.CSV...
Loading 9.CSV...
Concatenating Data...
Done


In [82]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,+
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,+
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,+
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,+
4,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,0,0,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149659,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
149660,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
149661,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
149662,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9


### Test/Train split

In [83]:
features = [feature for feature in df]
features.remove("class")

train_X = np.array(df[df.index%2 == 0][features])
train_y = df[df.index%2 == 0]["class"]

test_X = np.array(df[df.index%2 == 1][features])
test_y = df[df.index%2 == 1]["class"]

## Testing classifiers
I conducted some "accuracy vs parameter" tests and found these to the be the optimal `n_neighbors` and `max_depth` values. In the future I will put those tests here

In [53]:
# Kneighbors
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
print("Fitting...")
clf.fit(train_X, train_y)
print("Scoring...")
clf.score(test_X[:100], test_y[:100])

Fitting...
Scoring...


0.97

In [84]:
# Decision Tree
clf = tree.DecisionTreeClassifier(random_state=10)
print("Fitting...")
clf.fit(train_X, train_y)
print("Scoring...")
clf.score(test_X, test_y)

Fitting...
Scoring...


0.9230810348514005

In [55]:
# Gaussian Naive Bayes
clf = naive_bayes.GaussianNB()
print("Fitting...")
clf.fit(train_X, train_y)
print("Scoring...")
clf.score(test_X, test_y)

Fitting...
Scoring...


0.7919472952747487

### Finding most accurate hidden_layer_sizes value

In [56]:
# Using a small subset of data for faster results
data_portion = 500
train_X2 = np.array(df[df.index%data_portion == 0][features])
train_y2 = df[df.index%data_portion == 0]["class"]

test_X2 = np.array(df[df.index%data_portion == 1][features])
test_y2 = df[df.index%data_portion == 1]["class"]

# Recording scores
scoresNN = []
for hid in range(1, 20):
    print(f"fitting and scoring hidden_layer_sizes={hid}...")
    clf = neural_network.MLPClassifier(hidden_layer_sizes=(hid),max_iter=2000, random_state=42)
    clf.fit(train_X2, train_y2)
    scoresNN.append(clf.score(test_X2, test_y2))

fitting and scoring hidden_layer_sizes=1...
fitting and scoring hidden_layer_sizes=2...
fitting and scoring hidden_layer_sizes=3...
fitting and scoring hidden_layer_sizes=4...
fitting and scoring hidden_layer_sizes=5...
fitting and scoring hidden_layer_sizes=6...
fitting and scoring hidden_layer_sizes=7...
fitting and scoring hidden_layer_sizes=8...
fitting and scoring hidden_layer_sizes=9...
fitting and scoring hidden_layer_sizes=10...
fitting and scoring hidden_layer_sizes=11...
fitting and scoring hidden_layer_sizes=12...
fitting and scoring hidden_layer_sizes=13...
fitting and scoring hidden_layer_sizes=14...
fitting and scoring hidden_layer_sizes=15...
fitting and scoring hidden_layer_sizes=16...
fitting and scoring hidden_layer_sizes=17...
fitting and scoring hidden_layer_sizes=18...
fitting and scoring hidden_layer_sizes=19...


In [57]:
for i, score in zip(range(len(scoresNN)), scoresNN):
    print("hidden_layer_sizes =", i+1, '  ::  ', score)

hidden_layer_sizes = 1   ::   0.29333333333333333
hidden_layer_sizes = 2   ::   0.22333333333333333
hidden_layer_sizes = 3   ::   0.2833333333333333
hidden_layer_sizes = 4   ::   0.3466666666666667
hidden_layer_sizes = 5   ::   0.33666666666666667
hidden_layer_sizes = 6   ::   0.3566666666666667
hidden_layer_sizes = 7   ::   0.31333333333333335
hidden_layer_sizes = 8   ::   0.34
hidden_layer_sizes = 9   ::   0.4
hidden_layer_sizes = 10   ::   0.2966666666666667
hidden_layer_sizes = 11   ::   0.31
hidden_layer_sizes = 12   ::   0.4266666666666667
hidden_layer_sizes = 13   ::   0.4
hidden_layer_sizes = 14   ::   0.45
hidden_layer_sizes = 15   ::   0.38666666666666666
hidden_layer_sizes = 16   ::   0.4866666666666667
hidden_layer_sizes = 17   ::   0.47333333333333333
hidden_layer_sizes = 18   ::   0.45
hidden_layer_sizes = 19   ::   0.4066666666666667


Clearly `hidden_layer_sizes=16` is the most accurate value.

Using this value, I will conduct accuracy testing with all of the data

In [67]:
num = 19
train_X3 = np.array(df[df.index%num != 0][features])
train_y3 = df[df.index%num != 0]["class"]

test_X3 = np.array(df[df.index%num == 0][features])
test_y3 = df[df.index%num == 0]["class"]

clf = neural_network.MLPClassifier(hidden_layer_sizes=(16),max_iter=8000, random_state=42)
print("fitting...")
clf.fit(train_X3, train_y3)
print("scoring...")
clf.score(test_X3, test_y3)

fitting...
scoring...


0.9478293983244478

In [68]:
import pickle
with open('clf.pk', 'wb') as fout:
    pickle.dump(clf, fout)

In [72]:
clf2 = 10
with open('clf.pk', 'rb') as fin:
    clf2 = pickle.load(fin)
clf2.score(test_X, test_y)

0.9667121017746418