In [16]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut, GridSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score

In [17]:
# Load the iris data once; the returned object is read both by key and by
# attribute further down.
iris_dataset = load_iris()
# Uncomment to list every field the loader returned:
# print(iris_dataset.keys())

# Class labels first, then the names of the measured features.
for field in ('target_names', 'feature_names'):
    print(iris_dataset[field])

# Shape and container type of the feature matrix and of the label vector.
for field in ('data', 'target'):
    print(iris_dataset[field].shape, type(iris_dataset[field]))

# Hold-out split: stratified on the labels, with a fixed seed so that a
# re-run reproduces the same partition. No test_size is passed, so the
# library default applies.
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset.data,
    iris_dataset.target,
    stratify=iris_dataset.target,
    random_state=2022,
)

['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
(150, 4) <class 'numpy.ndarray'>
(150,) <class 'numpy.ndarray'>


In [18]:
# Gaussian naive Bayes: train on the training split, predict the held-out split.
nb = GaussianNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set: {:.3f}".format(test_accuracy))

# Per-class view of the same predictions: the confusion matrix first, then
# the precision / recall / F1 report.
for report in (metrics.confusion_matrix, metrics.classification_report):
    print(report(y_test, y_pred))

Accuracy on test set: 0.947
[[13  0  0]
 [ 0 11  2]
 [ 0  0 12]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      0.85      0.92        13
           2       0.86      1.00      0.92        12

    accuracy                           0.95        38
   macro avg       0.95      0.95      0.95        38
weighted avg       0.95      0.95      0.95        38



In [19]:
# 10-fold cross-validation of the naive Bayes model on the complete dataset
# (not just the training split).
scores = cross_val_score(nb, iris_dataset['data'], iris_dataset['target'], cv=10)

# One accuracy value per fold, followed by their mean.
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(np.mean(scores)))

Cross-validation scores: [0.93333333 0.93333333 1.         0.93333333 0.93333333 0.93333333
 0.86666667 1.         1.         1.        ]
Average cross-validation score: 0.95


In [20]:
# Leave-one-out: the splitter is handed to cross_val_score as the cv strategy,
# again over the complete dataset. This overwrites the earlier `scores`.
loo = LeaveOneOut()
scores = cross_val_score(nb, iris_dataset['data'], iris_dataset['target'], cv=loo)

print("Number of evaluations: ", len(scores))
print("Mean accuracy: {:.2f}".format(np.mean(scores)))

Number of evaluations:  150
Mean accuracy: 0.95


In [21]:
# Candidate k-NN settings: neighbourhood size crossed with the distance
# exponent p (1 = Manhattan, 2 = Euclidean).
param_grid = {
    'n_neighbors': [1, 3, 5, 11, 15],
    'p': [1, 2],
}

# Exhaustive search over the grid with 10-fold CV, fitted on the training
# split only; per-fold training scores are recorded as well.
knn = KNeighborsClassifier()
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=10,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# Report, in order: held-out score of the search object, the winning
# parameter combination, its mean CV score, and the estimator itself.
search_summary = (
    ("Test set score: {:.2f}", grid_search.score(X_test, y_test)),
    ("Best parameters: {}", grid_search.best_params_),
    ("Best cross-validation score: {:.2f}", grid_search.best_score_),
    ("Best estimator:\n{}", grid_search.best_estimator_),
)
for template, value in search_summary:
    print(template.format(value))

Test set score: 0.95
Best parameters: {'n_neighbors': 11, 'p': 1}
Best cross-validation score: 0.98
Best estimator:
KNeighborsClassifier(n_neighbors=11, p=1)
