In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Cleveland Heart Dataset
heart_data = pd.read_csv("processed_cleveland.csv")
# Replace missing data with the mean of the column (imputing data)
# Replace '?' with NaN
heart_data = heart_data.replace('?', np.nan)
# Convert columns to float
heart_data = heart_data.astype(float)
# Replace NaN with mean of column
heart_data = heart_data.apply(lambda x: x.fillna(x.mean()),axis=0)

X_train, X_test, y_train, y_test = train_test_split(heart_data.drop('goal', axis=1), heart_data['goal'], test_size=0.3, random_state=101)

In [3]:
# build Naive Bayes model with unscaled data
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [4]:
# build LDA model with unscaled data
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_predict_sk = lda.predict(X_test)

In [5]:
# build KNN model(using GridSearch to find the best k using 5-fold validation) with unscaled data
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 25)}
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(X_train, y_train)
# save the best model
knn = knn_gscv.best_estimator_

In [6]:
# build SVM model(using GridSearch to find the best kernel) with unscaled data
svm = SVC()
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
svm_gscv = GridSearchCV(svm, param_grid, cv=5)
svm_gscv.fit(X_train, y_train)
# save the best model
svm = svm_gscv.best_estimator_

In [7]:
# build NN model with unscaled data
nn = MLPClassifier()
nn.fit(X_train, y_train)

MLPClassifier()

In [8]:
# build Logistic Regression model with unscaled data
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

LogisticRegression()

In [9]:
# check accuracies of individual models with unscaled data
print("NB accuracy: ", nb.score(X_test, y_test))
print("LDA accuracy: ", lda.score(X_test, y_test))
print("KNN accuracy: ", knn.score(X_test, y_test))
print("SVM accuracy: ", svm.score(X_test, y_test))
print("NN accuracy: ", nn.score(X_test, y_test))
print("Logistic Regression accuracy: ", log_reg.score(X_test, y_test))

NB accuracy:  0.5824175824175825
LDA accuracy:  0.5934065934065934
KNN accuracy:  0.4945054945054945
SVM accuracy:  0.5934065934065934
NN accuracy:  0.5604395604395604
Logistic Regression accuracy:  0.5934065934065934


In [10]:
#create a dictionary of our models
estimators=[('nb',nb),('lda',lda),('knn', knn), ('svm', svm), ('nn', nn), ('log_reg', log_reg)]
#create our voting classifier, inputting our models, using voting='hard' to select by majority vote
ensemble = VotingClassifier(estimators, voting='hard')
#fit model to training data
ensemble.fit(X_train, y_train)
#test our model on the test data
print("Ensemble voting classifier on unscaled Cleveland Heart dataset: ", ensemble.score(X_test, y_test))

Ensemble voting classifier on unscaled Cleveland Heart dataset:  0.5714285714285714


In [11]:
# Scale the data
X_scaler = StandardScaler()

# Fit the scaler to the training data
X_scaler.fit(X_train)

# Scale the training and test data
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

In [12]:
# build Naive Bayes model with scaled data
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [13]:
# build LDA model with scaled data
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_predict_sk = lda.predict(X_test)

In [14]:
# build KNN model(using GridSearch to find the best k using 5-fold validation) with scaled data
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 25)}
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(X_train, y_train)
# save the best model
knn = knn_gscv.best_estimator_

In [15]:
# build SVM model(using GridSearch to find the best kernel) with scaled data
svm = SVC()
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
svm_gscv = GridSearchCV(svm, param_grid, cv=5)
svm_gscv.fit(X_train, y_train)
# save the best model
svm = svm_gscv.best_estimator_

In [16]:
# build NN model with scaled data
nn = MLPClassifier()
nn.fit(X_train, y_train)

MLPClassifier()

In [17]:
# build Logistic Regression model with scaled data
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

LogisticRegression()

In [18]:
# check accuracies of individual models with scaled data
print("NB accuracy: ", nb.score(X_test, y_test))
print("LDA accuracy: ", lda.score(X_test, y_test))
print("KNN accuracy: ", knn.score(X_test, y_test))
print("SVM accuracy: ", svm.score(X_test, y_test))
print("NN accuracy: ", nn.score(X_test, y_test))
print("Logistic Regression accuracy: ", log_reg.score(X_test, y_test))

NB accuracy:  0.5824175824175825
LDA accuracy:  0.5934065934065934
KNN accuracy:  0.6043956043956044
SVM accuracy:  0.6043956043956044
NN accuracy:  0.5824175824175825
Logistic Regression accuracy:  0.5604395604395604


In [19]:
#create a dictionary of our models
estimators=[('nb',nb),('lda',lda),('knn', knn), ('svm', svm), ('nn', nn), ('log_reg', log_reg)]
#create our voting classifier, inputting our models, using voting='hard' to select by majority vote
ensemble = VotingClassifier(estimators, voting='hard')
#fit model to training data
ensemble.fit(X_train, y_train)
#test our model on the test data
print("Ensemble voting classifier on scaled Cleveland Heart dataset: ", ensemble.score(X_test, y_test))

Ensemble voting classifier on scaled Cleveland Heart dataset:  0.5934065934065934


In [20]:
#Dry Beans Dataset
bean_data = pd.read_csv("Dry_Bean_Dataset.csv")

# Encode the Class values as integers
encoder = LabelEncoder()
encoder.fit(bean_data['Class'])
encoded_Y = encoder.transform(bean_data['Class'])

X_train, X_test, y_train, y_test = train_test_split(bean_data.drop('Class', axis=1), encoded_Y, test_size=0.3, random_state=101)

In [21]:
# build Naive Bayes model with unscaled data
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [22]:
# build LDA model with unscaled data
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_predict_sk = lda.predict(X_test)

In [23]:
# build KNN model(using GridSearch to find the best k using 5-fold validation) with unscaled data
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 25)}
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(X_train, y_train)
# save the best model
knn = knn_gscv.best_estimator_

In [24]:
# build SVM model(using GridSearch to find the best kernel) with unscaled data
svm = SVC()
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
svm_gscv = GridSearchCV(svm, param_grid, cv=5)
svm_gscv.fit(X_train, y_train)
# save the best model
svm = svm_gscv.best_estimator_

In [25]:
# build NN model with unscaled data
nn = MLPClassifier()
nn.fit(X_train, y_train)

MLPClassifier()

In [26]:
# build Logistic Regression model with unscaled data
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

LogisticRegression()

In [27]:
# check accuracies of individual models with unscaled data
print("NB accuracy: ", nb.score(X_test, y_test))
print("LDA accuracy: ", lda.score(X_test, y_test))
print("KNN accuracy: ", knn.score(X_test, y_test))
print("SVM accuracy: ", svm.score(X_test, y_test))
print("NN accuracy: ", nn.score(X_test, y_test))
print("Logistic Regression accuracy: ", log_reg.score(X_test, y_test))

NB accuracy:  0.7659157688540646
LDA accuracy:  0.9094025465230167
KNN accuracy:  0.7416748285994124
SVM accuracy:  0.9177277179236043
NN accuracy:  0.3922624877571009
Logistic Regression accuracy:  0.7029872673849168


In [28]:
#create a dictionary of our models
estimators=[('nb',nb),('lda',lda),('knn', knn), ('svm', svm), ('nn', nn), ('log_reg', log_reg)]

#create our voting classifier, inputting our models, using voting='hard' to select by majority vote
ensemble = VotingClassifier(estimators, voting='hard')

#fit model to training data
ensemble.fit(X_train, y_train)

#test our model on the test data
print("Ensemble voting classifier on unscaled Dry Beans dataset: ", ensemble.score(X_test, y_test))

Ensemble voting classifier on unscaled Dry Beans dataset:  0.8780607247796278


In [29]:
# Scale the data
X_scaler = StandardScaler()

# Fit the scaler to the training data
X_scaler.fit(X_train)

# Scale the training and test data
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

In [30]:
# build Naive Bayes model with scaled data
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [31]:
# build LDA model with scaled data
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_predict_sk = lda.predict(X_test)

In [32]:
# build KNN model(using GridSearch to find the best k using 5-fold validation) with scaled data
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 25)}
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(X_train, y_train)
# save the best model
knn = knn_gscv.best_estimator_

In [33]:
# build SVM model(using GridSearch to find the best kernel) with scaled data
svm = SVC()
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
svm_gscv = GridSearchCV(svm, param_grid, cv=5)
svm_gscv.fit(X_train, y_train)
# save the best model
svm = svm_gscv.best_estimator_

In [34]:
# build NN model with scaled data
nn = MLPClassifier()
nn.fit(X_train, y_train)

MLPClassifier()

In [35]:
# build Logistic Regression model with scaled data
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

LogisticRegression()

In [36]:
# check accuracies of individual models with scaled data
print("NB accuracy: ", nb.score(X_test, y_test))
print("LDA accuracy: ", lda.score(X_test, y_test))
print("KNN accuracy: ", knn.score(X_test, y_test))
print("SVM accuracy: ", svm.score(X_test, y_test))
print("NN accuracy: ", nn.score(X_test, y_test))
print("Logistic Regression accuracy: ", log_reg.score(X_test, y_test))

NB accuracy:  0.9000979431929481
LDA accuracy:  0.9094025465230167
KNN accuracy:  0.9231145935357493
SVM accuracy:  0.9309500489715965
NN accuracy:  0.9289911851126347
Logistic Regression accuracy:  0.9223800195886386


In [37]:
#create a dictionary of our models
estimators=[('nb',nb),('lda',lda),('knn', knn), ('svm', svm), ('nn', nn), ('log_reg', log_reg)]
#create our voting classifier, inputting our models, using voting='hard' to select by majority vote
ensemble = VotingClassifier(estimators, voting='hard')
#fit model to training data
ensemble.fit(X_train, y_train)
#test our model on the test data
print("Ensemble voting classifier on scaled Dry Beans dataset: ", ensemble.score(X_test, y_test))

Ensemble voting classifier on scaled Dry Beans dataset:  0.9292360430950048


In [None]:
#1.	Which of the following achieved higher accuracy?
#- b.	The classifier trained on the scaled dry bean dataset
#2.	Which of the following achieved higher accuracy?
#- b.	The classifier trained on the scaled Cleveland heart dataset
#3.	Did any of the individual classifiers achieve higher accuracy than the voting classifiers? If so, list the individual classifier(s) and give a brief explanation of why/how this occurred.
#- Yes, lDA classifier on scaled data and SVM classifier on both scaled and unscaled achieved higher accuracy than the voting classifier. As LDA classifier is linear in nature it performed better with scaled data as compared to ensemble voting classifier which averages the best possible result from other classfiers.
# The SVM classifier achieved better results than ensemble voting classifier for both scaled and unscaled data because,hard ensemble voting classifier choses the highest weighted mean for the result amongst the other classifier which is not as good as SVM classifier for large datasets, as SVM classifier can choose different kernels for different datasets, it is better in accuracy and storage for large datsets. 