In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, f1_score
import numpy as np
from csv import reader

# Load a CSV file
def load_csv(filename, skip_header = True, return_names = True):
    """Load a CSV file whose last column holds the class label.

    Every non-blank row is split into its feature cells (all columns but the
    last) and its label (the last column).

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    skip_header : bool, default True
        When True, the first non-blank row is treated as a header and is
        excluded from the returned features and labels.
    return_names : bool, default True
        When True, the feature cells of the first non-blank row (i.e. the
        row with its last column removed) are returned as column names.

    Returns
    -------
    (data, labels) or (data, labels, names)
        ``data`` is a float ndarray of feature rows, ``labels`` an ndarray of
        the last-column values and ``names`` an ndarray built from the first
        row's feature cells (empty when the file has no rows).

    Raises
    ------
    ValueError
        If a kept feature cell cannot be converted to float, e.g. when a
        textual header is left in by passing ``skip_header=False``.
    """
    dataset = []
    labels = []
    # newline='' lets the csv module do its own newline handling.
    with open(filename, 'r', newline='') as file:
        for row in reader(file):
            if not row:
                continue
            # The last cell is the label; what remains are the features.
            labels.append(row.pop())
            dataset.append(row)

    # Only the first row carries column names; slicing with [0:] would copy
    # the whole table. Guard the empty-file case so indexing cannot fail.
    names = dataset[0] if dataset else []

    if skip_header:
        dataset = dataset[1:]
        labels = labels[1:]

    features = np.array(dataset, dtype = 'float')
    if return_names:
        return features, np.array(labels), np.array(names)
    return features, np.array(labels)

In [2]:
# Load features, labels and column names, then hold out a test set.
data, labels, names = load_csv('Dry_Bean_Dataset.csv', skip_header = True)
# Fix the seed so the split, and every score computed from it, is identical
# after Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(data, labels, random_state=42)

In [3]:
# Fit scikit-learn's LinearDiscriminantAnalysis on the training split and
# score it on the held-out split.
lda = LinearDiscriminantAnalysis().fit(x_train, y_train)
y_predict_sk = lda.predict(x_test)
lda_f1 = f1_score(y_test, y_predict_sk, average="weighted")
lda_accuracy = accuracy_score(y_test, y_predict_sk)
print("f1 score of scikit-learn model is: ", lda_f1)
print("accuracy of scikit-learn model is: ", lda_accuracy)

f1 score of scikit-learn model is:  0.8991030930175596
accuracy of scikit-learn model is:  0.8968557155451072


In [4]:
# Evaluate scikit-learn's KNeighborsClassifier for k = 1..10 on the held-out split
f1s, accs = [], []
for i in range(1, 11):
    knn = KNeighborsClassifier(n_neighbors=i).fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    f1s.append(f1_score(y_test, y_pred, average="weighted"))
    accs.append(accuracy_score(y_test, y_pred))

f1_scores = np.asarray(f1s)
accuracies = np.asarray(accs)

# Position 0 in the score arrays corresponds to k=1, hence the +1 below.
for i, (f1_val, acc_val) in enumerate(zip(f1_scores, accuracies)):
    print("f1_score for k=", i + 1, ": ", f1_val)
    print("accuracy for k=", i + 1, ": ", acc_val)

best_f1_k = np.argmax(f1_scores) + 1
best_acc_k = np.argmax(accuracies) + 1
print("maximum f1 score was for k=", best_f1_k)
print("maximum accuracy was for k=", best_acc_k)

f1_score for k= 1 :  0.7430042551125777
accuracy for k= 1 :  0.7425800764031737
f1_score for k= 2 :  0.7188554625542919
accuracy for k= 2 :  0.7211284161034381
f1_score for k= 3 :  0.7323300754501477
accuracy for k= 3 :  0.7331766088745225
f1_score for k= 4 :  0.732438492041275
accuracy for k= 4 :  0.7337643255950632
f1_score for k= 5 :  0.7291556783249703
accuracy for k= 5 :  0.7317073170731707
f1_score for k= 6 :  0.7251346412577437
accuracy for k= 6 :  0.7272994416691155
f1_score for k= 7 :  0.7242536713142261
accuracy for k= 7 :  0.7278871583896562
f1_score for k= 8 :  0.7170758577499978
accuracy for k= 8 :  0.7208345577431678
f1_score for k= 9 :  0.7176511053470879
accuracy for k= 9 :  0.7223038495445195
f1_score for k= 10 :  0.711513638153381
accuracy for k= 10 :  0.7155451072583014
maximum f1 score was for k= 1
maximum accuracy was for k= 1


In [5]:
# Fit scikit-learn's GaussianNB on the training split and score it on the
# held-out split.
nb = GaussianNB().fit(x_train, y_train)
y_pred = nb.predict(x_test)
nb_f1 = f1_score(y_test, y_pred, average="weighted")
nb_accuracy = accuracy_score(y_test, y_pred)
print("f1 score: ", nb_f1)
print("accuracy: ", nb_accuracy)

f1 score:  0.762156953122046
accuracy:  0.76461945342345


In [6]:
# Run k-fold validation with k=5.
# KFold does not shuffle by default, so folds are taken in file order.
# NOTE(review): the very uneven recorded scores suggest the rows are grouped
# by class - confirm against the dataset.
kf = KFold(n_splits = 5)
accuracies = list()
f1_scores = list()
for trainIdx, testIdx in kf.split(data):
    # A fresh estimator per fold leaves the `lda` fitted on the train/test
    # split untouched, so earlier cells stay valid if re-run.
    fold_lda = LinearDiscriminantAnalysis()
    fold_lda.fit(data[trainIdx], labels[trainIdx])
    predictions = fold_lda.predict(data[testIdx])
    accuracies.append(accuracy_score(labels[testIdx], predictions))
    f1_scores.append(f1_score(labels[testIdx], predictions, average="weighted"))
print('Accuracies: %s' % accuracies)
print('Mean Accuracy: %.3f%%' % (np.mean(accuracies)*100))
print('F1 Scores: %s' % f1_scores)
print('Mean F1 Scores: %.3f%%' % (np.mean(f1_scores)*100))

Accuracies: [0.1340433345574734, 0.25459221160911094, 0.17707567964731816, 0.949669360764144, 0.28545187362233654]
Mean Accuracy: 36.017%
F1 Scores: [0.1743816090691616, 0.18120223038450375, 0.10108546992627684, 0.9739062896625441, 0.44412689339811373]
Mean F1 Scores: 37.494%


In [7]:
# Run k-fold validation with k=5 & shuffle.
# random_state pins the shuffle so the folds (and the scores below) are the
# same on every run.
kf = KFold(n_splits = 5, shuffle=True, random_state=42)
accuracies = list()
f1_scores = list()
for trainIdx, testIdx in kf.split(data):
    # A fresh estimator per fold leaves the `lda` fitted on the train/test
    # split untouched, so earlier cells stay valid if re-run.
    fold_lda = LinearDiscriminantAnalysis()
    fold_lda.fit(data[trainIdx], labels[trainIdx])
    predictions = fold_lda.predict(data[testIdx])
    accuracies.append(accuracy_score(labels[testIdx], predictions))
    f1_scores.append(f1_score(labels[testIdx], predictions, average="weighted"))
print('Accuracies: %s' % accuracies)
print('Mean Accuracy: %.3f%%' % (np.mean(accuracies)*100))
print('F1 Scores: %s' % f1_scores)
print('Mean F1 Scores: %.3f%%' % (np.mean(f1_scores)*100))

Accuracies: [0.906720528828498, 0.9011756061719324, 0.8989713445995592, 0.9099926524614255, 0.9088905216752388]
Mean Accuracy: 90.515%
F1 Scores: [0.9082060266453551, 0.9029563827242272, 0.9006770311592911, 0.9111949299314627, 0.9110413344357047]
Mean F1 Scores: 90.682%


In [8]:
# Run k-fold validation with k=5 using StratifiedKFold.
# Splitting on `labels` keeps the class proportions similar in every fold;
# no shuffling is requested here.
skf = StratifiedKFold(n_splits = 5)
accuracies = list()
f1_scores = list()
for trainIdx, testIdx in skf.split(data, labels):
    # A fresh estimator per fold leaves the `lda` fitted on the train/test
    # split untouched, so earlier cells stay valid if re-run.
    fold_lda = LinearDiscriminantAnalysis()
    fold_lda.fit(data[trainIdx], labels[trainIdx])
    predictions = fold_lda.predict(data[testIdx])
    accuracies.append(accuracy_score(labels[testIdx], predictions))
    f1_scores.append(f1_score(labels[testIdx], predictions, average="weighted"))
print('Accuracies: %s' % accuracies)
print('Mean Accuracy: %.3f%%' % (np.mean(accuracies)*100))
print('F1 Scores: %s' % f1_scores)
print('Mean F1 Scores: %.3f%%' % (np.mean(f1_scores)*100))

Accuracies: [0.7653323540213001, 0.9695077149155034, 0.9669360764144012, 0.9397501836884644, 0.6711976487876561]
Mean Accuracy: 86.254%
F1 Scores: [0.758035804722464, 0.9692123692086694, 0.9668636825111037, 0.9403375446209, 0.6170139729117534]
Mean F1 Scores: 85.029%


In [9]:
# Run k-fold validation with k=5 using StratifiedKFold & shuffle.
# random_state pins the shuffle so the folds (and the scores below) are the
# same on every run.
skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)
accuracies = list()
f1_scores = list()
for trainIdx, testIdx in skf.split(data, labels):
    # A fresh estimator per fold leaves the `lda` fitted on the train/test
    # split untouched, so earlier cells stay valid if re-run.
    fold_lda = LinearDiscriminantAnalysis()
    fold_lda.fit(data[trainIdx], labels[trainIdx])
    predictions = fold_lda.predict(data[testIdx])
    accuracies.append(accuracy_score(labels[testIdx], predictions))
    f1_scores.append(f1_score(labels[testIdx], predictions, average="weighted"))
print('Accuracies: %s' % accuracies)
print('Mean Accuracy: %.3f%%' % (np.mean(accuracies)*100))
print('F1 Scores: %s' % f1_scores)
print('Mean F1 Scores: %.3f%%' % (np.mean(f1_scores)*100))

Accuracies: [0.910025706940874, 0.8956649522409993, 0.9070536370315944, 0.9008082292432035, 0.9070536370315944]
Mean Accuracy: 90.412%
F1 Scores: [0.9114533882383399, 0.8976919261067821, 0.9083331394014968, 0.9025886551038372, 0.9089952896337384]
Mean F1 Scores: 90.581%


In [10]:
# The Linear Discriminant Analysis classifier had the best accuracy: 0.9030267411107846