In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import KFold
from tqdm import tqdm

from ID3 import ID3
from Neural_Network import Network
from Naive_Bayes import NaiveBayesClassifier

In [2]:
#### make_classification notes:
# n_samples : default=100
# n_features : default=20

### n_classes * n_clusters_per_class must be less than or equal to 2 ** n_informative
# n_informative : default=2
# n_classes : default=2
# n_clusters_per_class : default=2

# n_redundant : default=2
# n_repeated : default=0

# A higher class_sep makes the classification task easier
# class_sep : default=1.0


# Fixed seed so the exact same synthetic dataset is regenerated on every
# "Restart & Run All"; without it each run evaluates the models on different data.
DATA_SEED = 42

# To decrease training time the number of samples was lowered to 1000.
features, labels = make_classification(
    n_samples=1000,
    n_features=30,
    n_informative=20,
    n_classes=3,
    class_sep=1.5,
    random_state=DATA_SEED,
)



In [3]:
# Seed for everything in this cell that accepts one, so the fold assignment and
# the scikit-learn baselines are reproducible from a fresh kernel.
MODEL_SEED = 42

# 10-fold cross-validation; shuffling is seeded so every run uses the same folds.
kf = KFold(n_splits=10, shuffle=True, random_state=MODEL_SEED)


# scikit-learn reference models
sk_dtc = DecisionTreeClassifier(criterion='entropy', random_state=MODEL_SEED)
sk_gausNB = GaussianNB()
sk_mlpc = MLPClassifier(
    activation='logistic',
    hidden_layer_sizes=(15, 15),
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=5000,
    random_state=MODEL_SEED,
)

# our own implementations
id3 = ID3()
naive_bayes = NaiveBayesClassifier()

# 30 feature neurons, 2 hidden layers with 15 neurons each, 3 output neurons.
# For now, when we use relu our accuracy is very bad; we believe something is
# dying somewhere. We couldn't find the bug after we refactored our NN's code.
# NOTE(review): Network is not seeded here -- its constructor is not visible from
# this notebook, so its results may still vary between runs.
neuralnet = Network(layer_structures=[30, 15, 15, 3], iterations=5000)

# True labels of every test fold, in the order the folds are visited.
labels_true = []

# Pooled out-of-fold predictions, one list per model, aligned with labels_true.
sk_dtc_predicted = []
sk_gausNB_predicted = []
sk_mlpc_predicted = []

id3_predicted = []
naive_bayes_predicted = []
neuralnet_predicted = []


In [4]:
# train with numerical data

# Start from empty accumulators: without this, re-running the cell would append a
# second copy of every fold's labels and predictions and skew the reports below.
for accumulator in (labels_true,
                    sk_dtc_predicted, sk_gausNB_predicted, sk_mlpc_predicted,
                    id3_predicted, naive_bayes_predicted, neuralnet_predicted):
    accumulator.clear()

# This loop may take a little while: it trains 6 models once per fold.
# For speed's sake the number of samples was lowered to 1000; it takes about 4 minutes.
# tqdm reports the average time per fold.
# The bar total is read from the splitter so it cannot drift from n_splits, and the
# `with` block closes the bar even if one of the fits raises.
with tqdm(total=kf.get_n_splits()) as bar:
    for train_index, test_index in kf.split(features):
        features_train, features_test = features[train_index], features[test_index]
        labels_train, labels_test = labels[train_index], labels[test_index]

        # save all of the test labels
        labels_true.extend(labels_test)

        # Convert labels to one-hot rows for our neural network; features are numeric,
        # so they don't have to be converted.
        # DataFrame.as_matrix() was removed in pandas 1.0, so use .values instead.
        # dtype is pinned to uint8 (the historical get_dummies default) because
        # newer pandas versions would otherwise return booleans.
        labels_train_one_hot = pd.get_dummies(labels_train, dtype=np.uint8).values

        # train models
        # note: scikit-learn's fit() discards prior training and starts fresh;
        # our own models are assumed to behave the same way -- TODO confirm
        sk_dtc.fit(features_train, labels_train)
        sk_gausNB.fit(features_train, labels_train)
        sk_mlpc.fit(features_train, labels_train)

        id3.fit(features_train, labels_train)
        naive_bayes.fit(features_train, labels_train)
        neuralnet.fit(features_train, labels_train_one_hot)

        # get predictions
        sk_dtc_predicted.extend(sk_dtc.predict(features_test))
        sk_gausNB_predicted.extend(sk_gausNB.predict(features_test))
        sk_mlpc_predicted.extend(sk_mlpc.predict(features_test))

        id3_predicted.extend(id3.predict(features_test))
        naive_bayes_predicted.extend(naive_bayes.predict(features_test))
        neuralnet_predicted.extend(neuralnet.predict(features_test, classification=True))

        bar.update()

100%|██████████| 10/10 [03:43<00:00, 20.38s/it]


In [5]:
# Each entry pairs the heading to print with that model's pooled out-of-fold
# predictions. Headings carry their own leading blank lines so the printed output
# is identical to printing each report by hand.
model_reports = [
    ('sklearn.tree.DecisionTreeClassifier:', sk_dtc_predicted),
    ('\n\nsklearn.naive_bayes.GaussianNB:', sk_gausNB_predicted),
    ('\n\nsklearn.neural_network.MLPClassifier:', sk_mlpc_predicted),
    ('\n\nOur ID3 Decision Tree:', id3_predicted),
    ('\n\nOur Naive Bayes:', naive_bayes_predicted),
    ('\n\nOur Neural Network:', neuralnet_predicted),
]

# Print overall accuracy followed by the per-class precision/recall/F1 table
# for every model, always scored against the same pooled true labels.
for heading, predicted in model_reports:
    print(heading)
    print('Accuracy: {:.2f}'.format(accuracy_score(labels_true, predicted)))
    print(classification_report(labels_true, predicted))

sklearn.tree.DecisionTreeClassifier:
Accuracy: 0.71
             precision    recall  f1-score   support

          0       0.72      0.74      0.73       336
          1       0.72      0.72      0.72       328
          2       0.68      0.67      0.67       336

avg / total       0.71      0.71      0.71      1000



sklearn.naive_bayes.GaussianNB:
Accuracy: 0.85
             precision    recall  f1-score   support

          0       0.88      0.88      0.88       336
          1       0.84      0.91      0.87       328
          2       0.83      0.77      0.80       336

avg / total       0.85      0.85      0.85      1000



sklearn.neural_network.MLPClassifier:
Accuracy: 0.90
             precision    recall  f1-score   support

          0       0.91      0.92      0.92       336
          1       0.90      0.91      0.90       328
          2       0.88      0.86      0.87       336

avg / total       0.90      0.90      0.90      1000



Our ID3 Decision Tree:
Accuracy: 0.71
