	
# classifier.py 
This Python notebook uses a few different methods to build classifiers for the data provided by Dr. Chawla's challenge assignment.  Based on how the classifiers perform according to various validation frameworks, it chooses the best classifier to predict classes for the data included as testing data.  It writes the predictions, one per line, to a file called "classification_test.predictions".
### Margaret Thomann & Michael McRoskey	
#### May 7 2018

In [61]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from IPython.display import HTML, display
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
import tabulate

### Defining the training, testing and prediction files

In [62]:
# All input and output files sit in the same data directory, one level up.
DATA_DIR = "../data/"

# Training data that is read into df_train.
TRAIN_FILE = DATA_DIR + "classification_train.data"
# Test data that is read into df_test.
TEST_FILE = DATA_DIR + "classification_test.test"
# Destination for the predicted classes, one per line.
PREDICTION_FILE = DATA_DIR + "classification_test.predictions"

### Reading training and testing data into data frames

In [63]:
# Column names: 48 features labelled f01..f48. The training file has one
# extra trailing column, "Class"; the test file has the features only.
features_test = ["f%02d" % idx for idx in range(1, 49)]
features_train = features_test + ["Class"]

# Read both files into pandas data frames, supplying the column names
# explicitly through names=.
df_train = pd.read_csv(TRAIN_FILE, names=features_train)
df_test = pd.read_csv(TEST_FILE, names=features_test)

# Define x and y from data
#     y is the label column; x, x_a, x_b and x_c are four independent frames
#     holding every column except the label. The x_* frames will be
#     transformed later in "Feature Selection".
y = df_train['Class']
x, x_a, x_b, x_c = [df_train.drop(['Class'], axis=1) for _ in range(4)]

### Feature Selection
* **a** : x_a is the non-transformed data; no feature selection has been performed 
* **b** : x_b is the transformed data using the ExtraTreesClassifier 
* **c** : x_c is the transformed data using the VarianceThreshold 

In [64]:
# Fixed seed: ExtraTreesClassifier is randomized, so without it the set of
# selected features (and therefore the shape of x_b) can differ on every run.
RANDOM_STATE = 42

# Tree Based Feature Selection
#     Fit an extra-trees ensemble on the full training set, then wrap the
#     already-fitted estimator (prefit=True) in SelectFromModel and keep only
#     the columns it selects.
# NOTE(review): the selector is fitted on all training rows and uses the
# labels, so the cross-validation scores later computed on x_b are likely
# optimistic; fitting the selector inside each fold (e.g. via a Pipeline)
# would avoid that -- confirm whether this matters for the assignment.
clf = ExtraTreesClassifier(random_state=RANDOM_STATE)
clf = clf.fit(x, y)
model = SelectFromModel(clf, prefit=True)
x_b = model.transform(x)

# Low Variance Removal
#     Drops every feature whose variance does not exceed
#     0.99 * (1 - 0.99) = 0.0099.
#     Fitted on the untouched feature frame x (identical in content to the
#     initial x_c) so that re-running this cell never feeds the selector its
#     own output.
# NOTE(review): p * (1 - p) is the variance of a 0/1 feature; the threshold
# only reads as "same value in more than 99% of rows" if the features are
# binary -- TODO confirm.
sel = VarianceThreshold(threshold=(.99 * (1 - .99)))
x_c = sel.fit_transform(x)

# Report the resulting matrix shapes. A single pre-formatted argument keeps
# the printed text identical under both Python 2 and Python 3.
for label, matrix in (("x_a", x_a), ("x_b", x_b), ("x_c", x_c)):
    print("%s shape: %s" % (label, matrix.shape))

x_a shape: (4147, 48)
x_b shape: (4147, 10)
x_c shape: (4147, 41)


### Building & validating classifier

In [65]:
print("---Building Classifiers---")

# Fixed seed so the randomized estimators (MLP, decision tree) produce the
# same cross-validation scores every time the notebook is re-run.
RANDOM_STATE = 42

# (label, estimator) pairs, in the column order used by the metrics table.
# cross_val_score clones the estimator for every fold, so a single unfitted
# instance can be shared by all feature sets and no separate fit() on the
# full data is needed beforehand.
classifiers = [
    ("MLP", MLPClassifier(random_state=RANDOM_STATE)),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("Decision Trees", DecisionTreeClassifier(random_state=RANDOM_STATE)),
]

# Define the number of classifiers for performance metrics later
NUM_CLASSIFIERS = len(classifiers)

# Loop through to cross-validate each classifier on each transformed x array.
# results is laid out feature-set major: results[i * NUM_CLASSIFIERS + j]
# holds the 10 per-fold accuracies of classifiers[j] on x_array[i].
x_array = [x_a, x_b, x_c]
results = []
for i, features in enumerate(x_array):
    for label, estimator in classifiers:
        fold_scores = cross_val_score(estimator, features, y, cv=10, scoring="accuracy")
        results.append(fold_scores)
        # Single pre-formatted argument: same text on Python 2 and Python 3.
        print("\t * %s Classifier built for iteration:  %d" % (label, i))
    print("\n")
    


---Building Classifiers---
	 * MLP Classifier built for iteration:  0
	 * AdaBoost Classifier built for iteration:  0
	 * Decision Trees Classifier built for iteration:  0
	 * MLP Classifier built for iteration:  1
	 * AdaBoost Classifier built for iteration:  1
	 * Decision Trees Classifier built for iteration:  1
	 * MLP Classifier built for iteration:  2
	 * AdaBoost Classifier built for iteration:  2
	 * Decision Trees Classifier built for iteration:  2


### Classifier performance metrics

In [70]:
print("---Performance Metrics---")

# First table row: one heading per classifier, preceded by an empty corner cell.
header = ["", "MLP Classifier Accuracy", "Gaussian Naive Bayes Classifier Accuracy", "Decision Trees Classifier Accuracy"]
# One label per feature-selection variant, in the same order as x_array.
row_labels = ["a. No Feature Selection", "b. Extra Trees Feature Selection", "c. Variance Threshold Feature Selection"]

# results must hold NUM_CLASSIFIERS consecutive score arrays per feature set;
# fail loudly here instead of rendering a misaligned table.
assert len(results) == len(row_labels) * NUM_CLASSIFIERS, \
    "results does not contain NUM_CLASSIFIERS score arrays per feature set"

table = [header]
for row_idx, row_label in enumerate(row_labels):
    # Slice out this feature set's scores using NUM_CLASSIFIERS rather than a
    # hard-coded 3, so the grouping follows the classifier count.
    start = row_idx * NUM_CLASSIFIERS
    cells = ["%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)
             for scores in results[start:start + NUM_CLASSIFIERS]]
    table.append([row_label] + cells)

# Each cell reads "mean (+/- 2 * standard deviation)" of the per-fold accuracies.
display(HTML(tabulate.tabulate(table, tablefmt='html')))


---Performance Metrics---


0,1,2,3
,MLP Classifier Accuracy,Gaussian Naive Bayes Classifier Accuracy,Decision Trees Classifier Accuracy
a. No Feature Selection,0.80 (+/- 0.05),0.77 (+/- 0.03),0.80 (+/- 0.04)
b. Extra Trees Feature Selection,0.78 (+/- 0.02),0.84 (+/- 0.03),0.80 (+/- 0.02)
c. Variance Threshold Feature Selection,0.81 (+/- 0.03),0.78 (+/- 0.03),0.80 (+/- 0.04)


### Predicting using classifier on test data

In [69]:
# Final model: Gaussian Naive Bayes on the extra-trees-selected features,
# the combination with the highest mean accuracy in the recorded metrics table.
y_train = df_train['Class']

# Feature selection
#     Reuse `model`, the selector fitted in the "Feature Selection" cell,
#     instead of fitting a fresh ExtraTreesClassifier here. The ensemble is
#     randomized, so a refit could select a different feature subset from the
#     one that was cross-validated as x_b.
x_train = model.transform(df_train.drop(['Class'], axis=1))

# Fit Model
final_classifier = GaussianNB()
final_classifier.fit(x_train, y_train)

# Feature selection on test: the same selector keeps the same columns.
x_test = model.transform(df_test)

# Predict model and write one predicted class per line.
predictions = final_classifier.predict(x_test)
with open(PREDICTION_FILE, "w") as f:
    f.writelines(str(number) + "\n" for number in predictions)