## K Nearest Neighbor (KNN) Classifiers

### Setup

First, let's import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated, so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20.

In [None]:
# The plotting backend is selected once, by the `%matplotlib inline` magic
# further down in this cell, so no other backend magic is issued here.

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically: a plain string comparison is
# lexicographic and would, for example, accept "0.9" as being >= "0.20".
_sklearn_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_version >= (0, 20)

# Common imports
import numpy as np
import pandas as pd
import seaborn as sn
import os

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer


# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels on every figure
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


# to make this notebook's output stable across runs
# (seeds NumPy's global random number generator)
np.random.seed(42)

# Where to save the figures
PROJECT_ROOT_DIR = "."

# Name of the sub-folder under "images" that holds this notebook's figures
CHAPTER_ID = 'KNN'
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

# Create the output folder up front; exist_ok avoids an error when it already exists
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension).
    tight_layout : bool
        When true, call ``plt.tight_layout()`` before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Passed to ``savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [None]:
#pip install -U scikit-learn

### KNN classifier

- Read KNN classifier in Python:
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

- Check out the difference between model parameters and hyperparameters:
https://towardsdatascience.com/model-parameters-and-hyperparameters-in-machine-learning-what-is-the-difference-702d30970f6

### Build KNN classifiers on a well-known dataset: the iris dataset


In [None]:
from IPython.display import Image

# Show the picture stored at images/iris.png; the Image object must stay the
# last expression of the cell for the notebook to render it
Image("images/iris.png")

### Dataset 1: iris

#### Read the data from a local file: iris.csv is stored in a folder "data"

In [None]:
# Load the iris table from the local CSV file into a DataFrame
iris = pd.read_csv('./data/iris.csv')

# target_names holds the class labels used when reporting per-class metrics
target_names = ['setosa', 'versicolor', 'virginica']

# Preview the first and the last rows of the table
print(iris.head(), iris.tail(), sep='\n')

# X: values of the first four columns; y: values of the 'Class' column
X, y = iris.iloc[:, :4].to_numpy(), iris['Class'].to_numpy()

#print(X)
#print(y)

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes)
iris.info()

In [None]:
from sklearn import preprocessing

# Feature columns only: the first four columns of the iris table
data = iris.iloc[:, :4]

# Min-max scaling maps every feature to the [0, 1] range (MinMaxScaler default).
# NOTE(review): the scaler is fitted on the whole dataset, before any
# train/test split, so evaluations that use X_normalized have seen the
# min/max of their test samples — confirm this is acceptable.
min_max = preprocessing.MinMaxScaler()

col = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']
normData = pd.DataFrame(min_max.fit_transform(data), columns=col)
print(normData.head())

# normData holds exactly the four scaled feature columns, so it converts directly
X_normalized = normData.to_numpy()

#X_normalized

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold 

### K-Fold Cross-Validation (k = 3) on the original data

- KNN classifier with varying k values: 1, 5, 10, 15, 20, 25, 30

In [None]:
# 3-fold cross-validation with shuffling. With random_state=None the shuffle
# draws from NumPy's global random generator.
kf = KFold(n_splits=3, random_state=None, shuffle=True)

# Numbers of neighbors to evaluate
k_values = [1, 5, 10, 15, 20, 25, 30]

# Mean train/test accuracy for each k, collected for the plot at the end
mean_train_scores = []
mean_test_scores = []

for k in k_values:
    # Per-fold metrics for the current k
    precision = []
    recall = []
    accuracy = []
    train_score = []
    test_score = []

    knn = KNeighborsClassifier(n_neighbors=k)

    for train_index, test_index in kf.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)

        train_score.append(knn.score(X_train, y_train))
        test_score.append(knn.score(X_test, y_test))

        # One report per fold; the weighted averages and accuracy are read from it
        metrics_dict = classification_report(y_test, y_pred,
                                             target_names=target_names,
                                             output_dict=True)
        precision.append(metrics_dict['weighted avg']['precision'])
        recall.append(metrics_dict['weighted avg']['recall'])
        accuracy.append(metrics_dict['accuracy'])

    # Average each metric over the folds
    print('\n******* Performance with k =', k, '*******')
    print('Precision: ', sum(precision)/len(precision))
    print('Recall: ', sum(recall)/len(recall))
    print('Accuracy: ', sum(accuracy)/len(accuracy))
    print('**************************************\n')

    mean_test_scores.append(np.mean(test_score))
    mean_train_scores.append(np.mean(train_score))

# Draw on a fresh figure so the points never land on an earlier plot
plt.figure()
plt.plot(k_values, mean_test_scores, 'bo', label='test')
plt.plot(k_values, mean_train_scores, 'rx', label='train')
plt.xlabel('k (number of neighbors)')
plt.ylabel('accuracy')
plt.legend();
    

### Repeated random train/test splits on the original data

- KNN classifier with varying training size

In [None]:
# Fractions of the data used for training: 0.1, 0.2, ..., 0.8
t = [x/10 for x in range(1, 9)]

knn = KNeighborsClassifier(n_neighbors = 10)

# Mean train/test accuracy for each training fraction
mean_train_scores = []
mean_test_scores = []

for s in t:
    train_score = []
    test_score = []
    # 99 independent random splits per training fraction
    for i in range(1,100):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-s)
        knn.fit(X_train, y_train)
        train_score.append(knn.score(X_train, y_train))
        test_score.append(knn.score(X_test, y_test))

    mean_train_scores.append(np.mean(train_score))
    mean_test_scores.append(np.mean(test_score))

plt.figure()
plt.plot(t, mean_train_scores, 'rx', label='train')
plt.plot(t, mean_test_scores, 'bo', label='test')

# The x values are fractions (0.1-0.8), not percentages
plt.xlabel('Training set proportion')
plt.ylabel('accuracy')
plt.legend();

### K-Fold Cross-Validation (k = 3) on the normalized data

- KNN classifier with varying k values, 1, 5, 10, 15, 20, 25, 30

In [None]:
# 3-fold cross-validation with shuffling. With random_state=None the shuffle
# draws from NumPy's global random generator.
kf = KFold(n_splits=3, random_state=None, shuffle=True)

# Numbers of neighbors to evaluate
k_values = [1, 5, 10, 15, 20, 25, 30]

# Mean train/test accuracy for each k, collected for the plot at the end
mean_train_scores = []
mean_test_scores = []

for k in k_values:
    # Per-fold metrics for the current k
    precision = []
    recall = []
    accuracy = []
    train_score = []
    test_score = []

    knn = KNeighborsClassifier(n_neighbors=k)

    for train_index, test_index in kf.split(X_normalized):
        # Index the normalized features, so this experiment really evaluates
        # the scaled data rather than the original X
        X_train, y_train = X_normalized[train_index], y[train_index]
        X_test, y_test = X_normalized[test_index], y[test_index]

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)

        train_score.append(knn.score(X_train, y_train))
        test_score.append(knn.score(X_test, y_test))

        # One report per fold; the weighted averages and accuracy are read from it
        metrics_dict = classification_report(y_test, y_pred,
                                             target_names=target_names,
                                             output_dict=True)
        precision.append(metrics_dict['weighted avg']['precision'])
        recall.append(metrics_dict['weighted avg']['recall'])
        accuracy.append(metrics_dict['accuracy'])

    # Average each metric over the folds
    print('\n******* Performance with k =', k, '*******')
    print('Precision: ', sum(precision)/len(precision))
    print('Recall: ', sum(recall)/len(recall))
    print('Accuracy: ', sum(accuracy)/len(accuracy))
    print('**************************************\n')

    mean_test_scores.append(np.mean(test_score))
    mean_train_scores.append(np.mean(train_score))

# Draw on a fresh figure so the points never land on an earlier plot
plt.figure()
plt.plot(k_values, mean_test_scores, 'bo', label='test')
plt.plot(k_values, mean_train_scores, 'rx', label='train')
plt.xlabel('k (number of neighbors)')
plt.ylabel('accuracy (normalized)')
plt.legend();
    

### Repeated random train/test splits on the normalized data

- KNN classifier with varying training size

In [None]:
# Fractions of the data used for training: 0.1, 0.2, ..., 0.8
t = [x/10 for x in range(1, 9)]

# NOTE(review): the matching experiment on the original data in this notebook
# uses n_neighbors=10; confirm whether the two were meant to use the same k.
knn = KNeighborsClassifier(n_neighbors = 5)

# Mean train/test accuracy for each training fraction
mean_train_scores = []
mean_test_scores = []

for s in t:
    train_score = []
    test_score = []
    # 99 independent random splits per training fraction
    for i in range(1,100):
        X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size = 1-s)
        knn.fit(X_train, y_train)
        train_score.append(knn.score(X_train, y_train))
        test_score.append(knn.score(X_test, y_test))

    mean_train_scores.append(np.mean(train_score))
    mean_test_scores.append(np.mean(test_score))

plt.figure()
plt.plot(t, mean_train_scores, 'rx', label='train')
plt.plot(t, mean_test_scores, 'bo', label='test')

# The x values are fractions (0.1-0.8), not percentages
plt.xlabel('Training set proportion')
plt.ylabel('accuracy')
plt.legend();

### Naive Bayesian Classifier

- The original data

- modeling with varying training data size

In [None]:
# Fractions of the data used for training: 0.1, 0.2, ..., 0.8
t = [x/10 for x in range(1, 9)]

# Gaussian Naive Bayes classifier evaluated on the original features
nbclf1 = GaussianNB()

# Mean train/test accuracy for each training fraction
mean_train_scores = []
mean_test_scores = []

for s in t:
    train_score = []
    test_score = []
    # 99 independent random splits per training fraction
    for i in range(1,100):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-s)
        nbclf1.fit(X_train, y_train)
        train_score.append(nbclf1.score(X_train, y_train))
        test_score.append(nbclf1.score(X_test, y_test))

    mean_train_scores.append(np.mean(train_score))
    mean_test_scores.append(np.mean(test_score))

plt.figure()
plt.plot(t, mean_train_scores, 'rx', label='train')
plt.plot(t, mean_test_scores, 'bo', label='test')

# The x values are fractions (0.1-0.8), not percentages
plt.xlabel('Training set proportion')
plt.ylabel('accuracy')
plt.legend();

#### Naive Bayesian classifier

- Normalized data

- Modeling with varying training data size

In [None]:
# Fractions of the data used for training: 0.1, 0.2, ..., 0.8
t = [x/10 for x in range(1, 9)]

# Gaussian Naive Bayes classifier evaluated on the min-max normalized features
nbclf2 = GaussianNB()

# Mean train/test accuracy for each training fraction
mean_train_scores = []
mean_test_scores = []

for s in t:
    train_score = []
    test_score = []
    # 99 independent random splits per training fraction
    for i in range(1,100):
        X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size = 1-s)
        nbclf2.fit(X_train, y_train)
        train_score.append(nbclf2.score(X_train, y_train))
        test_score.append(nbclf2.score(X_test, y_test))

    mean_train_scores.append(np.mean(train_score))
    mean_test_scores.append(np.mean(test_score))

plt.figure()
plt.plot(t, mean_train_scores, 'rx', label='train')
plt.plot(t, mean_test_scores, 'bo', label='test')

# The x values are fractions (0.1-0.8), not percentages
plt.xlabel('Training set proportion')
plt.ylabel('accuracy (normalized)')
plt.legend();