In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score

from ID3 import ID3
from Neural_Network import Network
from Naive_Bayes import NaiveBayesClassifier

In [2]:
def convert_to_one_hot(matrix, duplicate_first_column=True):
    """One-hot encode every column of a 2-D categorical matrix.

    Each column is expanded with ``pd.get_dummies`` and the resulting
    indicator blocks are joined side by side, in column order.

    Parameters
    ----------
    matrix : array-like, 2-D
        Categorical values; anything ``pd.DataFrame`` accepts.
    duplicate_first_column : bool, default True
        The historical implementation emitted the first column's indicator
        block twice (once when seeding the result, once in the regular
        concatenation). That widened output is kept as the default because
        existing callers size their models to it. Pass ``False`` to get a
        clean encoding with every column represented exactly once.

    Returns
    -------
    numpy.ndarray
        uint8 array of shape ``(n_rows, n_indicator_columns)``. When the
        input has no columns an empty ``(n_rows, 0)`` array is returned.
    """
    df = pd.DataFrame(matrix)

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    # The uint8 cast pins the dtype older pandas produced; newer releases
    # return boolean indicator columns from get_dummies.
    encoded_columns = [
        pd.get_dummies(df[col]).values.astype(np.uint8) for col in df
    ]

    if not encoded_columns:
        return np.empty((len(df), 0), dtype=np.uint8)

    if duplicate_first_column:
        # Reproduce the legacy layout: first column's block appears twice.
        encoded_columns.insert(0, encoded_columns[0])

    # Single concatenation instead of re-allocating the result per column.
    return np.concatenate(encoded_columns, axis=1)

In [3]:
# The CSV has no header row, so pandas numbers the columns 0..N.
df = pd.read_csv('agaricus-lepiota.data.csv', header=None)

# Column 0 is split off as the label vector; pop() also removes it from df,
# so df.values afterwards contains only the feature columns.
labels = df.pop(0).values
features = df.values

# lookupTable holds the sorted distinct labels and indexed_labels the integer
# position of each row's label in it (lookupTable[indexed_labels] == labels).
lookupTable, indexed_labels = np.unique(labels, return_inverse=True)

# One-hot encoded copy of the features (used as the neural network's input).
features_one_hot = convert_to_one_hot(features)

In [4]:
id3 = ID3()
naive_bayes = NaiveBayesClassifier()

# Layer sizes are derived from the data instead of being hard-coded:
#   * input  = one neuron per column of the one-hot feature matrix
#     (123 in the original run; note that convert_to_one_hot() emits the first
#     feature's indicator columns twice by default, and that repeated group is
#     part of this width),
#   * output = one neuron per distinct class label (2 in the original run).
# Known issue from the authors: with ReLU activations accuracy is very bad,
# something appears to be "dying" somewhere; the bug was not found after the
# neural network code was refactored.
neuralnet = Network(
    layer_structures=[features_one_hot.shape[1], 15, len(lookupTable)],
    iterations=500,
)

# Accumulators filled across all cross-validation folds.
labels_true = []
id3_predicted = []
naive_bayes_predicted = []
neuralnet_predicted = []

In [5]:
# Cross-validation settings.
N_SPLITS = 10
CV_SEED = 42  # fixes the shuffled fold assignment so reruns use the same splits

# Start from empty result lists so that re-running this cell does not append a
# second copy of every prediction to the lists from a previous run.
labels_true = []
id3_predicted = []
naive_bayes_predicted = []
neuralnet_predicted = []

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=CV_SEED)

# The context manager closes the progress bar even if a fold raises.
with tqdm(total=N_SPLITS) as bar:
    for train_index, test_index in kf.split(features):
        features_train, features_test = features[train_index], features[test_index]
        labels_train, labels_test = labels[train_index], labels[test_index]
        features_train_one_hot = features_one_hot[train_index]
        features_test_one_hot = features_one_hot[test_index]

        # save all of the test labels, in the same order as the predictions below
        labels_true.extend(labels_test)

        # convert labels to one hots for the neural network.
        # .values replaces DataFrame.as_matrix(), removed in pandas 1.0; the
        # uint8 cast keeps the dtype older pandas produced (newer pandas
        # returns boolean indicator columns).
        labels_train_one_hot = pd.get_dummies(labels_train).values.astype(np.uint8)

        # Train models
        # note (from the authors) that fit() deletes prior training and starts fresh
        id3.fit(features_train, labels_train)
        naive_bayes.fit(features_train, labels_train)
        neuralnet.fit(features_train_one_hot, labels_train_one_hot)

        # get predictions
        id3_predicted.extend(id3.predict(features_test))
        naive_bayes_predicted.extend(naive_bayes.predict(features_test))
        neuralnet_predicted.extend(
            neuralnet.predict(features_test_one_hot, classification=True)
        )

        bar.update()

100%|██████████| 10/10 [02:25<00:00, 14.67s/it]


In [6]:
# Map the network's predictions back to the original label values.
# Each prediction is used as an index into lookupTable (presumably an integer
# class index returned by predict(classification=True) -- confirm against the
# Network implementation). A new list is built instead of overwriting
# neuralnet_predicted, so this cell can be re-run without indexing lookupTable
# with already-converted labels.
neuralnet_predicted_labels = [
    lookupTable[class_index] for class_index in neuralnet_predicted
]


def print_model_report(title, predicted):
    """Print overall accuracy and the per-class report for one model.

    Parameters
    ----------
    title : str
        Heading printed above the metrics.
    predicted : sequence
        Predicted labels, aligned element-wise with ``labels_true``.
    """
    print('\n\n{}:'.format(title))
    print('Accuracy: {:.2f}'.format(accuracy_score(labels_true, predicted)))
    print(classification_report(labels_true, predicted))


for model_title, model_predictions in (
    ('Our ID3 Decision Tree', id3_predicted),
    ('Our Naive Bayes', naive_bayes_predicted),
    ('Our Neural Network', neuralnet_predicted_labels),
):
    print_model_report(model_title, model_predictions)



Our ID3 Decision Tree:
Accuracy: 1.00
             precision    recall  f1-score   support

          e       1.00      1.00      1.00      4208
          p       1.00      1.00      1.00      3916

avg / total       1.00      1.00      1.00      8124



Our Naive Bayes:
Accuracy: 0.95
             precision    recall  f1-score   support

          e       0.92      1.00      0.96      4208
          p       0.99      0.91      0.95      3916

avg / total       0.96      0.95      0.95      8124



Our Neural Network:
Accuracy: 1.00
             precision    recall  f1-score   support

          e       1.00      1.00      1.00      4208
          p       1.00      1.00      1.00      3916

avg / total       1.00      1.00      1.00      8124

