In [1]:
# load dependencies

import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw dataset from "data.csv" (path is relative to the notebook's
# working directory) into a DataFrame.
raw_data = pd.read_csv(filepath_or_buffer="data.csv")

FileNotFoundError: File b'data.csv' does not exist

In [None]:
# Separate the feature columns (X) from the 'target' label column (y), then
# hold out 30% of the rows as a test split using a fixed seed.
X = raw_data.drop(columns=['target']).values
y = raw_data['target'].values

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Candidate classifiers to compare, as (short label, estimator) pairs.
# Every estimator is constructed with its library default hyper-parameters.
models_list = [
    ('DT', DecisionTreeClassifier()),
    ('SVM', SVC()),
    ('NB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
    ('RF', RandomForestClassifier()),
    ('NN', MLPClassifier()),
]

In [None]:
# evaluate models performance: 10-fold cross-validated accuracy on the training split
num_folds = 10
results = []  # one array of per-fold accuracies per model, in models_list order
names = []    # matching model labels, used as tick labels by the plotting cell

# shuffle=True is required for random_state to have any effect; recent
# scikit-learn releases raise a ValueError when random_state is set on an
# unshuffled KFold. The splitter does not depend on the model, so it is built
# once and every model is scored on the same folds.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=123)

for name, model in models_list:
    start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    end = time.perf_counter()
    results.append(cv_results)
    names.append(name)
    # label: mean accuracy (std across folds) (wall-clock seconds for the whole CV run)
    print( "%s: %f (%f) (run time: %f)" % (name, cv_results.mean(), cv_results.std(), end-start))

In [None]:
# plot the results: one box per model showing the spread of per-fold accuracies
fig, ax = plt.subplots()
fig.suptitle('Before Performance Comparison(Train)')
ax.boxplot(results)
ax.set_xticklabels(names)
# Save BEFORE showing: once plt.show() returns the current figure has been
# released, so a later plt.savefig() would write an empty image.
fig.savefig("performenceBefore.png")
plt.show()

In [None]:
# evaluate models performance: 10-fold cross-validated accuracy on the held-out split
# NOTE(review): this re-runs cross-validation inside X_test/y_test rather than
# scoring models fitted on the training split -- confirm that is the intent.
num_folds = 10
results = []  # one array of per-fold accuracies per model, in models_list order
names = []    # matching model labels, used as tick labels by the plotting cell

# shuffle=True is required for random_state to have any effect; recent
# scikit-learn releases raise a ValueError when random_state is set on an
# unshuffled KFold. The splitter does not depend on the model, so it is built
# once and every model is scored on the same folds.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=123)

for name, model in models_list:
    start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
    cv_results = cross_val_score(model, X_test, y_test, cv=kfold, scoring='accuracy')
    end = time.perf_counter()
    results.append(cv_results)
    names.append(name)
    # label: mean accuracy (std across folds) (wall-clock seconds for the whole CV run)
    print( "%s: %f (%f) (run time: %f)" % (name, cv_results.mean(), cv_results.std(), end-start))

In [None]:
# plot the results: one box per model showing the spread of per-fold accuracies
fig, ax = plt.subplots()
fig.suptitle('Before Performance Comparison(test)')
ax.boxplot(results)
ax.set_xticklabels(names)
# Save BEFORE showing: once plt.show() returns the current figure has been
# released, so a later plt.savefig() would write an empty image.
# The file name is distinct from the train-split figure ("performenceBefore.png")
# so this plot does not overwrite it.
fig.savefig("performenceBeforeTest.png")
plt.show()

In [None]:
# using Pipeline and evaluate models again using scaled data (training split)

# Standardize the dataset: every pipeline runs StandardScaler before the
# classifier, so inside cross_val_score the scaler is re-fit on the training
# part of each fold only.
pipelines = [
    ('ScaledDT', Pipeline([
        ('Scaler', StandardScaler()),
        ('CART', DecisionTreeClassifier(max_depth=4, random_state=0)),
    ])),
    ('ScaledSVM', Pipeline([
        ('Scaler', StandardScaler()),
        ('SVM', SVC(C=1000)),
    ])),
    ('ScaledNB', Pipeline([
        ('Scaler', StandardScaler()),
        ('NB', GaussianNB()),
    ])),
    ('ScaledKNN', Pipeline([
        ('Scaler', StandardScaler()),
        ('KNN', KNeighborsClassifier(n_neighbors=8)),
    ])),
    ('ScaledRF', Pipeline([
        ('Scaler', StandardScaler()),
        ('RF', RandomForestClassifier(max_depth=4, random_state=0)),
    ])),
    ('ScaledNN', Pipeline([
        ('Scaler', StandardScaler()),
        # a single hidden layer of 5 units, written as the tuple the API documents
        ('NN', MLPClassifier(hidden_layer_sizes=(5,), activation='relu', alpha=0.5,
                             learning_rate='constant', max_iter=200)),
    ])),
]

results = []  # one array of per-fold accuracies per pipeline
names = []    # matching pipeline labels, used as tick labels by the plotting cell

# num_folds is set by the earlier evaluation cells.
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn releases raise a ValueError when random_state is set on an
# unshuffled KFold.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=123)

# Silence all warnings raised while fitting so the printed summary stays
# readable; note this hides every warning category, not just one.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for name, model in pipelines:
        start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
        end = time.perf_counter()
        results.append(cv_results)
        names.append(name)
        # label: mean accuracy (std across folds) (wall-clock seconds for the whole CV run)
        print( "%s: %f (%f) (run time: %f)" % (name, cv_results.mean(), cv_results.std(), end-start))

In [None]:
# Box plot of the per-fold accuracies collected in `results`, one box per
# entry, labelled with the corresponding entry of `names`.
fig, ax = plt.subplots()
ax.boxplot(results)
ax.set_xticklabels(names)
fig.suptitle('After Performance Comparison(train)')
plt.show()



In [None]:
# using Pipeline and evaluate models again using scaled data (held-out split)
# NOTE(review): this re-runs cross-validation inside X_test/y_test rather than
# scoring pipelines fitted on the training split -- confirm that is the intent.

# Standardize the dataset: every pipeline runs StandardScaler before the
# classifier, so inside cross_val_score the scaler is re-fit on the training
# part of each fold only.
pipelines = [
    ('ScaledDT', Pipeline([
        ('Scaler', StandardScaler()),
        ('CART', DecisionTreeClassifier(max_depth=4, random_state=0)),
    ])),
    ('ScaledSVM', Pipeline([
        ('Scaler', StandardScaler()),
        ('SVM', SVC(C=1000)),
    ])),
    ('ScaledNB', Pipeline([
        ('Scaler', StandardScaler()),
        ('NB', GaussianNB()),
    ])),
    ('ScaledKNN', Pipeline([
        ('Scaler', StandardScaler()),
        ('KNN', KNeighborsClassifier(n_neighbors=8)),
    ])),
    ('ScaledRF', Pipeline([
        ('Scaler', StandardScaler()),
        ('RF', RandomForestClassifier(max_depth=4, random_state=0)),
    ])),
    ('ScaledNN', Pipeline([
        ('Scaler', StandardScaler()),
        # a single hidden layer of 5 units, written as the tuple the API documents
        ('NN', MLPClassifier(hidden_layer_sizes=(5,), activation='relu', alpha=0.5,
                             learning_rate='constant', max_iter=200)),
    ])),
]

results = []  # one array of per-fold accuracies per pipeline
names = []    # matching pipeline labels, used as tick labels by the plotting cell

# num_folds is set by the earlier evaluation cells.
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn releases raise a ValueError when random_state is set on an
# unshuffled KFold.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=123)

# Silence all warnings raised while fitting so the printed summary stays
# readable; note this hides every warning category, not just one.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for name, model in pipelines:
        start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
        cv_results = cross_val_score(model, X_test, y_test, cv=kfold, scoring='accuracy')
        end = time.perf_counter()
        results.append(cv_results)
        names.append(name)
        # label: mean accuracy (std across folds) (wall-clock seconds for the whole CV run)
        print( "%s: %f (%f) (run time: %f)" % (name, cv_results.mean(), cv_results.std(), end-start))

In [None]:
# Box plot of the per-fold accuracies collected in `results`, one box per
# entry, labelled with the corresponding entry of `names`.
fig, ax = plt.subplots()
ax.boxplot(results)
ax.set_xticklabels(names)
fig.suptitle('After Performance Comparison(test)')
plt.show()


