In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification, load_wine
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

from ID3 import ID3
from Neural_Network import Network, relu
from Naive_Bayes import NaiveBayesClassifier

In [2]:
# make_classification parameter notes (scikit-learn defaults):
#   n_samples            : default=100
#   n_features           : default=20
#
# Constraint: n_classes * n_clusters_per_class must be smaller than or equal
# to 2 ** n_informative.
#   n_informative        : default=2
#   n_classes            : default=2
#   n_clusters_per_class : default=2
#
#   n_redundant          : default=2
#   n_repeated           : default=0
#
# A higher class_sep makes the classification task easier.
#   class_sep            : default=1.0


# Synthetic 3-class problem: 3000 samples, 30 features, 20 of them informative.
# random_state pins the random draw so that "Restart Kernel -> Run All" rebuilds
# exactly the same data set; without it every run compares the models on
# different data and the reported scores cannot be reproduced.
features, labels = make_classification(n_samples=3000, n_classes=3, n_informative=20,
                                       n_features=30, class_sep=1.5, random_state=42)



In [3]:
# test_size defaults to 0.25, i.e. 25% of the samples are held out for testing

# features_train, features_test, labels_train, labels_test = train_test_split(features, labels)



In [4]:

# # sklearn's decision tree
# dtc = DecisionTreeClassifier()
# dtc = dtc.fit(features_train, labels_train)
# dtc_prediction = dtc.predict(features_test)
# print('sklearn.tree.DecisionTreeClassifier:')
# print(classification_report(labels_test, dtc_prediction))

# # our ID3 decision tree
# id3 = ID3()
# id3.fit(features_train, labels_train)
# predictions = id3.predict(features_test)
# print('\n\nOur ID3 Decision Tree:')
# print(classification_report(labels_test, predictions))


In [5]:
# # sklearn's naive bayes
# gausNB = GaussianNB()
# gausNB.fit(features_train, labels_train)
# gausNB_prediction = gausNB.predict(features_test)
# print('sklearn.naive_bayes.GaussianNB')
# print(classification_report(labels_test, gausNB_prediction))


# # nbc = NaiveBayesClassifier()
# # nbc.fit(features_train, labels_train)
# # nbc_prediction = nbc.predict(features_test)
# # print(classification_report(labels_test, nbc_prediction))


In [6]:
# labels_train_one_hot = pd.get_dummies(labels_train).values  # DataFrame.as_matrix() was removed in pandas 1.0

# neuralnet = Network(layer_structures=[40, 10, 10, 3], activation_function=relu) 
# neuralnet.fit(features_train, labels_train_one_hot, iterations=5000)
# neuralnet_prediction = neuralnet.predict(features_test, classification=True)

# print(classification_report(labels_test, neuralnet_prediction))

In [7]:
from sklearn.model_selection import KFold
from tqdm import tqdm_notebook

# One seed for every stochastic component created in this cell, so the fold
# assignment and the scikit-learn models behave the same after a kernel restart.
RANDOM_SEED = 42

# 10-fold cross-validation; samples are shuffled before being split into folds.
kf = KFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)


# --- scikit-learn reference models ---
sk_dtc = DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_SEED)
sk_gausNB = GaussianNB()
sk_mlpc = MLPClassifier(activation='relu'
                        , hidden_layer_sizes=(15, 15)
                        , learning_rate='constant'
                        , learning_rate_init=0.001
                        , max_iter=5000
                        , random_state=RANDOM_SEED)

# --- our own implementations ---
id3 = ID3()
naive_bayes = NaiveBayesClassifier()
# neuralnet gets created/recreated in the loop


# Accumulators filled by the cross-validation loop: the true labels of every
# held-out fold plus, per model, the predictions made for those same samples.
labels_true = []

sk_dtc_predicted = []
sk_gausNB_predicted = []
sk_mlpc_predicted = []

id3_predicted = []
naive_bayes_predicted = []
neuralnet_predicted = []


In [8]:
# Train with numerical data: fit every model on each training fold and pool the
# predictions made on the corresponding held-out fold.

# Empty the accumulators first. The loop below only appends, so without this a
# second execution of the cell would add another copy of every label and
# prediction and silently distort the reports.
for collected in (labels_true,
                  sk_dtc_predicted, sk_gausNB_predicted, sk_mlpc_predicted,
                  id3_predicted, naive_bayes_predicted, neuralnet_predicted):
    collected.clear()

# The progress-bar total is read from the splitter instead of repeating its fold
# count by hand, and the context manager closes the bar even if a model raises.
with tqdm_notebook(total=kf.get_n_splits()) as bar:
    for train_index, test_index in kf.split(features):
        features_train, features_test = features[train_index], features[test_index]
        labels_train, labels_test = labels[train_index], labels[test_index]

        # save all of the test labels
        labels_true.extend(labels_test)

        # Convert labels to one-hot vectors for the neural network.
        # DataFrame.as_matrix() was removed in pandas 1.0; .values works on old and
        # new versions alike. The cast keeps the uint8 dtype get_dummies used to
        # produce (recent pandas returns booleans instead).
        labels_train_one_hot = pd.get_dummies(labels_train).values.astype(np.uint8)

        # Recreate the neural network for every fold: for now it does not reset
        # its weights, so a fresh instance is needed.
        neuralnet = Network(layer_structures=[30, 15, 15, 3], activation_function=relu)

        # train models
        sk_dtc.fit(features_train, labels_train)
        sk_gausNB.fit(features_train, labels_train)
        sk_mlpc.fit(features_train, labels_train)

        id3.fit(features_train, labels_train)
        naive_bayes.fit(features_train, labels_train)
        neuralnet.fit(features_train, labels_train_one_hot, iterations=5000)

        # get predictions
        sk_dtc_predicted.extend(sk_dtc.predict(features_test))
        sk_gausNB_predicted.extend(sk_gausNB.predict(features_test))
        sk_mlpc_predicted.extend(sk_mlpc.predict(features_test))

        id3_predicted.extend(id3.predict(features_test))
        naive_bayes_predicted.extend(naive_bayes.predict(features_test))
        neuralnet_predicted.extend(neuralnet.predict(features_test, classification=True))

        bar.update()




In [13]:
# Print accuracy and the per-class report for every model, using the
# predictions pooled over all cross-validation folds.
# Each entry pairs the heading to print with that model's pooled predictions.
report_sections = [
    ('sklearn.tree.DecisionTreeClassifier:', sk_dtc_predicted),
    ('\n\nsklearn.naive_bayes.GaussianNB:', sk_gausNB_predicted),
    ('\n\nsklearn.neural_network.MLPClassifier:', sk_mlpc_predicted),
    ('\n\nOur ID3 Decision Tree:', id3_predicted),
    ('\n\nOur Naive Bayes:', naive_bayes_predicted),
    ('\n\nOur Neural Network:', neuralnet_predicted),
]

for heading, predicted in report_sections:
    print(heading)
    print('Accuracy: {:.2f}'.format(accuracy_score(labels_true, predicted)))
    print(classification_report(labels_true, predicted))

sklearn.tree.DecisionTreeClassifier:
Accuracy: 0.77
             precision    recall  f1-score   support

          0       0.79      0.79      0.79      1000
          1       0.77      0.78      0.78      1000
          2       0.76      0.75      0.76      1000

avg / total       0.77      0.77      0.77      3000



sklearn.naive_bayes.GaussianNB:
Accuracy: 0.82
             precision    recall  f1-score   support

          0       0.85      0.82      0.84      1000
          1       0.82      0.85      0.84      1000
          2       0.79      0.80      0.80      1000

avg / total       0.82      0.82      0.82      3000



sklearn.neural_network.MLPClassifier:
Accuracy: 0.95
             precision    recall  f1-score   support

          0       0.95      0.95      0.95      1000
          1       0.95      0.95      0.95      1000
          2       0.94      0.94      0.94      1000

avg / total       0.95      0.95      0.95      3000



Our ID3 Decision Tree:
Accuracy: 0.78
