In [10]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from numpy import savetxt
import math

from sklearn import preprocessing, cross_validation, neighbors
import compute_measure
import pandas as pd
from numpy import genfromtxt, savetxt

In [11]:
def median(lst):
	"""Return the median of the values in ``lst``.

	The input is not modified; a sorted copy is used internally.

	Parameters:
		lst: an iterable of numbers (any order).

	Returns:
		None if ``lst`` is empty; the middle element if the length is odd;
		otherwise the mean of the two middle elements, as a float.
	"""
	ordered = sorted(lst)
	count = len(ordered)
	if count < 1:
		return None
	# Floor division keeps the index an int on both Python 2 and Python 3;
	# plain `/` yields a float on Python 3, which is not a valid list index.
	mid = count // 2
	if count % 2 == 1:
		return ordered[mid]
	return float(ordered[mid - 1] + ordered[mid]) / 2.0

In [12]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv("data.csv")

# X: every column except the label column 'class'.
# `axis` is passed by keyword: the positional form df.drop(labels, 1) is
# rejected by pandas >= 2.0, while the keyword form works on old and new versions.
X = np.array(df.drop(['class'], axis=1))
# y: the 'class' column, used as the target labels.
y = np.array(df['class'])


In [13]:
# Hold out 20% of the full dataset as the final test set.
# `train_test_split` is the name imported from sklearn.model_selection at the top
# of the notebook; the old sklearn.cross_validation module was removed in
# scikit-learn 0.20.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Split a further 20% off the training portion (not 10%) to compare the
# performance of the candidate classifiers before touching the final test set.
A_train, A_test, b_train, b_test = train_test_split(X_train, y_train, test_size=0.2)

In [14]:
# Display names for the classifiers; they are paired positionally with the
# `classifiers` list (via zip), so the order here must match that list.
names = [
	"k-NN", "DecisionTree", "RandomForest",
	"AdaBoost", "NaiveBayes", "QDA",
	"Gradient Boosting", "LDA", "MLP",
]

In [15]:
# Candidate estimators, one per entry of `names` and in the same order
# (the two lists are zipped together when the models are evaluated).
classifiers = [
	KNeighborsClassifier(n_neighbors=3),
	DecisionTreeClassifier(max_depth=5),
	RandomForestClassifier(n_estimators=100, oob_score=True),
	AdaBoostClassifier(),
	GaussianNB(),
	QuadraticDiscriminantAnalysis(),
	GradientBoostingClassifier(),
	LinearDiscriminantAnalysis(),
	# NOTE(review): hidden_layer_sizes=(200, 1) requests two hidden layers
	# (200 units, then 1 unit). If a single 200-unit layer was intended this
	# should be (200,) -- confirm with the author.
	MLPClassifier(
		hidden_layer_sizes=(200, 1),
		alpha=1e-5,
		max_iter=10000,
	),
]

In [16]:
# Fit every candidate classifier on the inner training split (A_train/b_train),
# score it on the inner hold-out split (A_test/b_test), and record the
# predictions, the metric list and the misclassification count for ranking.
classifier_no = len(classifiers)

misclassified_number_list = []

copy_names = list(names)
predictions = []
answers = []

print("\nAnalyzing Classifiers' Performance on Small Dataset:")
for name, clf in zip(names, classifiers):
	clf.fit(A_train, b_train)
	pred = clf.predict(A_test)
	predictions.append(pred)
	# compute_measure is a project helper; the first three entries of its
	# result are used below as accuracy, sensitivity and specificity.
	ans = compute_measure.compute_measure(pred, b_test)
	answers.append(ans)
	accuracy = ans[0]
	sen = ans[1]
	spec = ans[2]
	print(name)
	print("accuracy, sen, spec, ppr, npr:")
	print("{}".format(ans))
	# size * (1 - accuracy) is a whole number only up to floating-point error
	# (e.g. 1.9999999...), so round before converting; a bare int() truncates
	# and under-reports the count by one.
	# Presumes `accuracy` is the fraction of correct predictions -- verify in compute_measure.
	misclassified_number = int(round(b_test.size * (1 - accuracy)))
	print("Misclassified Number: {} out of {}".format(misclassified_number, b_test.size))
	misclassified_number_list.append(misclassified_number)
	# D combines accuracy with the mean of sensitivity and specificity on a log scale.
	d = math.log(1+accuracy)+math.log(1+(float(sen+spec)/2))
	print("D: {}".format(d))



Analyzing Classifiers' Performance on Small Dataset:
k-NN
accuracy, sen, spec, ppr, npr:
[0.99420289855072463, 0.979009235936188, 0.9987449799196787, 0.9957301451750641, 0.9937562437562437]
Misclassified Number: 30 out of 5175
D: 1.37781463301
DecisionTree
accuracy, sen, spec, ppr, npr:
[0.994975845410628, 0.9848866498740554, 0.9979919678714859, 0.9932260795935648, 0.9954932398597897]
Misclassified Number: 26 out of 5175
D: 1.3794895908
RandomForest
accuracy, sen, spec, ppr, npr:
[1.0, 1.0, 1.0, 1.0, 1.0]
Misclassified Number: 0 out of 5175
D: 1.38629436112
AdaBoost
accuracy, sen, spec, ppr, npr:
[0.99961352657004832, 0.998320738874895, 1.0, 1.0, 0.9994982438534872]
Misclassified Number: 1 out of 5175
D: 1.3856812023
NaiveBayes
accuracy, sen, spec, ppr, npr:
[0.95903381642512076, 0.9076406381192276, 0.9743975903614458, 0.9137785291631445, 0.9724448897795591]
Misclassified Number: 212 out of 5175
D: 1.33566455266
QDA
accuracy, sen, spec, ppr, npr:
[0.85198067632850238, 0.93786733837111

In [17]:
# Rank the candidates by misclassification count (ties fall back to the
# classifier name, the second tuple field) and keep the best three.
ranked = list(zip(misclassified_number_list, copy_names, classifiers, predictions, answers))
ranked.sort()
new_classifiers = ranked[:3]

print("Top Three Classifiers")

# Report each selected classifier and collect its hold-out predictions.
prediction_results = []
for misclassified_number, name, clf, pred, ans in new_classifiers:
	print(name)
	print("{}".format(ans))
	summary = "Misclassified Number: {} out of {}".format(int(misclassified_number), b_test.size)
	print(summary)
	prediction_results.append(pred)

Top Three Classifiers
RandomForest
[1.0, 1.0, 1.0, 1.0, 1.0]
Misclassified Number: 0 out of 5175
AdaBoost
[0.99961352657004832, 0.998320738874895, 1.0, 1.0, 0.9994982438534872]
Misclassified Number: 1 out of 5175
Gradient Boosting
[0.99922705314009663, 0.9974811083123426, 0.9997489959839357, 0.9991589571068125, 0.9992473657802308]
Misclassified Number: 3 out of 5175


In [18]:
# Refit each of the three selected classifiers on the full training split
# (X_train/y_train) and evaluate it on the final test split (X_test/y_test).
prediction_results = []

for misclassified_number, name, clf, pred, ans in new_classifiers:
	print(name)
	clf.fit(X_train, y_train)
	pred = clf.predict(X_test)
	prediction_results.append(pred)
	# The first three entries of compute_measure's result are used as
	# accuracy, sensitivity and specificity.
	ans = compute_measure.compute_measure(pred, y_test)
	print("{}".format(ans))
	accuracy = ans[0]
	sen = ans[1]
	spec = ans[2]
	# Round before converting: size * (1 - accuracy) can land just below the
	# true whole number (e.g. 4.9999999...), and a bare int() would truncate
	# it to one fewer misclassification than actually occurred.
	# Presumes `accuracy` is the fraction of correct predictions -- verify in compute_measure.
	test_misclassified = int(round(y_test.size * (1 - accuracy)))
	print("Misclassified Number: {} out of {}".format(test_misclassified, y_test.size))
	# D combines accuracy with the mean of sensitivity and specificity on a log scale.
	d = math.log(1+accuracy)+math.log(1+(float(sen+spec)/2))
	print("D: {}".format(d))

RandomForest
[0.9992270830112846, 0.998022412656559, 0.9995961227786753, 0.9986807387862797, 0.9993943064809206]
4
Misclassified Number: 4 out of 6469
D: 1.38531228449
AdaBoost
[0.9992270830112846, 0.998022412656559, 0.9995961227786753, 0.9986807387862797, 0.9993943064809206]
4
Misclassified Number: 4 out of 6469
D: 1.38531228449
Gradient Boosting
[0.9992270830112846, 0.998022412656559, 0.9995961227786753, 0.9986807387862797, 0.9993943064809206]
4
Misclassified Number: 4 out of 6469
D: 1.38531228449
