## Importing libraries/data

### Importing Libraries

In [1]:
# For general use
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# For our classifiers
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import tree
from sklearn import neural_network

# For confusion matrix/data importing
from sklearn import metrics  
from sklearn.datasets import load_digits  

### Importing Math Symbol data

In [2]:
dfs = []
for cls in ['+', ')', '(', ',', '-']:
    print(f"Loading {cls}.CSV...")
    a = pd.read_csv("CSV/"+cls+".csv", index_col=0)
    a['class'] = cls
    dfs.append(a)

print("Concatenating Data...")
df = pd.concat(dfs,ignore_index=True, sort =False)
print("Done")

Loading +.CSV...
Loading ).CSV...
Loading (.CSV...
Loading ,.CSV...
Loading -.CSV...
Concatenating Data...
Done


### Test/Train split

In [3]:
features = [feature for feature in df]
features.remove("class")

train_X = np.array(df[df.index%2 == 0][features])
train_y = df[df.index%2 == 0]["class"]

test_X = np.array(df[df.index%2 == 1][features])
test_y = df[df.index%2 == 1]["class"]

### Importing Digits

This is where we will import the MNIST data set to classify digits as well (in the future)

## Testing classifiers
I conducted some "accuracy vs parameter" tests and found these to the be the optimal `n_neighbors` and `max_depth` values. In the future I will put those tests here

In [4]:
# Kneighbors
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
print("Fitting...")
clf.fit(train_X, train_y)
print("Scoring...")
clf.score(test_X[:100], test_y[:100])

Fitting...
Scoring...


0.98

In [5]:
# Decision Tree
clf = tree.DecisionTreeClassifier(random_state=10)
print("Fitting...")
clf.fit(train_X, train_y)
print("Scoring...")
clf.score(test_X, test_y)

Fitting...
Scoring...


0.980638829407566

In [6]:
# Gaussian Naive Bayes
clf = naive_bayes.GaussianNB()
print("Fitting...")
clf.fit(train_X, train_y)
print("Scoring...")
clf.score(test_X, test_y)

Fitting...
Scoring...


0.905870806566738

### Finding most accurate hidden_layer_sizes value

In [11]:
# Using a small subset of data for faster results
data_portion = 1000
train_X2 = np.array(df[df.index%data_portion == 0][features])
train_y2 = df[df.index%data_portion == 0]["class"]

test_X2 = np.array(df[df.index%data_portion == 1][features])
test_y2 = df[df.index%data_portion == 1]["class"]

# Recording scores
scoresNN = []
for hid in range(1, 15):
    print(f"fitting and scoring hidden_layer_sizes={hid}...")
    clf = neural_network.MLPClassifier(hidden_layer_sizes=(hid),max_iter=2000, random_state=42)
    clf.fit(train_X2, train_y2)
    scoresNN.append(clf.score(test_X2, test_y2))

fitting and scoring hidden_layer_sizes=1...
fitting and scoring hidden_layer_sizes=2...
fitting and scoring hidden_layer_sizes=3...
fitting and scoring hidden_layer_sizes=4...
fitting and scoring hidden_layer_sizes=5...
fitting and scoring hidden_layer_sizes=6...
fitting and scoring hidden_layer_sizes=7...
fitting and scoring hidden_layer_sizes=8...
fitting and scoring hidden_layer_sizes=9...
fitting and scoring hidden_layer_sizes=10...
fitting and scoring hidden_layer_sizes=11...
fitting and scoring hidden_layer_sizes=12...
fitting and scoring hidden_layer_sizes=13...
fitting and scoring hidden_layer_sizes=14...


In [12]:
for i, score in zip(range(len(scoresNN)), scoresNN):
    print("hidden_layer_sizes =", i+1, '  ::  ', score)

hidden_layer_sizes = 1   ::   0.5222222222222223
hidden_layer_sizes = 2   ::   0.6333333333333333
hidden_layer_sizes = 3   ::   0.6777777777777778
hidden_layer_sizes = 4   ::   0.7
hidden_layer_sizes = 5   ::   0.7111111111111111
hidden_layer_sizes = 6   ::   0.7777777777777778
hidden_layer_sizes = 7   ::   0.8111111111111111
hidden_layer_sizes = 8   ::   0.8333333333333334
hidden_layer_sizes = 9   ::   0.7444444444444445
hidden_layer_sizes = 10   ::   0.7222222222222222
hidden_layer_sizes = 11   ::   0.7555555555555555
hidden_layer_sizes = 12   ::   0.7
hidden_layer_sizes = 13   ::   0.7333333333333333
hidden_layer_sizes = 14   ::   0.7222222222222222


Clearly `hidden_layer_sizes=8` is the most accurate value.

Using this value, I will conduct accuracy testing with all of the data

In [13]:
num = 4
train_X3 = np.array(df[df.index%num != 0][features])
train_y3 = df[df.index%num != 0]["class"]

test_X3 = np.array(df[df.index%num == 0][features])
test_y3 = df[df.index%num == 0]["class"]

clf = neural_network.MLPClassifier(hidden_layer_sizes=(8),max_iter=8000, random_state=42)
print("fitting...")
clf.fit(train_X3, train_y3)
print("scoring...")
clf.score(test_X3, test_y3)

fitting...
scoring...


0.9965203426124197