# K-nearest neighbors classification

![K-Nearest Neighbors](https://static.javatpoint.com/tutorial/machine-learning/images/k-nearest-neighbor-algorithm-for-machine-learning3.png)

In [1]:
# Loading the dataset
import pandas as pd

# Column labels are supplied explicitly via names=, so pandas treats every line of the file as data.
names = [
    "Sequence Name",
    "mcg", "gvh", "lip", "chg", "aac", "alm1", "alm2",
    "location",
]
ecoli = pd.read_csv("data/ecoli.data", names=names)

# The sequence name does not contain generalizable information for modeling, so discard it.
ecoli = ecoli.drop(columns=["Sequence Name"])

# Discard the rows whose location is om, omL, imL or imS.
# NOTE(review): presumably these classes have too few samples to model - confirm.
excluded_locations = ["om", "omL", "imL", "imS"]
ecoli = ecoli[~ecoli["location"].isin(excluded_locations)]
ecoli.head()

Unnamed: 0,mcg,gvh,lip,chg,aac,alm1,alm2,location
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,0.07,0.4,0.48,0.5,0.54,0.35,0.44,cp
2,0.56,0.4,0.48,0.5,0.49,0.37,0.46,cp
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp


In [2]:
import matplotlib.pyplot as plt
# Encode the categorical target as integer codes so it becomes quantitative.
## (author's note: location varies when alm1 varies)
location_labels = ["cp", "im", "pp", "imU"]
# Each label's code is its position in the list: cp -> 0, im -> 1, pp -> 2, imU -> 3.
cleanup_nums = {"location": {label: code for code, label in enumerate(location_labels)}}
ecoli = ecoli.replace(cleanup_nums)

## KNN algorithm

In [6]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score , confusion_matrix, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split

In [42]:
# Baseline: fit a 1-nearest-neighbour classifier on the complete dataset
# and score it on those very same rows.
X = ecoli.drop(columns="location")  # feature matrix
y = ecoli["location"]  # class labels
# fit() returns the estimator itself, so construction and training can be chained.
model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
y_model = model.predict(X)
# Training and evaluation rows are identical here, so this is training accuracy only.
accuracy_score(y, y_model)

1.0

### KNN Validation with Hold-out
Hold-out is when you split up your dataset into a **train** and **test** set. The training set is what the model is trained on, and the test set is used to see how well that model performs on unseen data. A common split when using the hold-out method is using `70%` of data for training and the remaining `30%` of the data for testing.



In [46]:
import numpy as np
# Hold-out validation: train on 70% of the rows, evaluate on the held-out 30%.
X = ecoli.drop(["location"], axis=1)  # features
y = ecoli["location"]  # target
model2 = KNeighborsClassifier(n_neighbors=5)
# random_state=np.random draws from NumPy's global random state, so every run
# of this cell produces a different split.
X1, X2, y1, y2 = train_test_split(X, y, random_state=np.random, train_size=0.7)
# Fit and score the 5-NN model created above. Previously the 1-NN `model` from the
# earlier cell was fitted here instead, leaving model2 unused and refitting `model`.
model2.fit(X1, y1)
y2_model = model2.predict(X2)
y_pred = y2_model  # alias kept for any code that refers to the predictions as y_pred
accuracy_score(y2, y2_model)
## -> Running this cell several times gives different results, because the
##    training set differs from one run to the next (Train1 != Train2).

0.8494623655913979

### KNN Validation with Cross-validation
Cross-validation or *k-fold cross-validation* is when the dataset is randomly split up into **k** groups. One of the groups is used as the test set and the rest are used as the training set. The model is trained on the training set and scored on the test set. Then the process is repeated until each unique group has been used as the test set.

In [39]:
# 5-fold cross-validation
from sklearn.model_selection import cross_val_score

# One score per fold; leaving the array as the last expression displays it.
fold_scores = cross_val_score(estimator=model, X=X, y=y, cv=5)
fold_scores
## -> The result is always the same: with an integer cv the folds are built
##    without shuffling, so every run evaluates the same partitions.

array([0.90322581, 0.90322581, 0.70491803, 0.81967213, 0.75409836])

### KNN Validation with LOOCV
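Leave-one-out cross-validation (LOOCV) trains the model on all observations but one and tests it on the single observation that was left out. The process is repeated until every observation has been used exactly once as the test set, and the resulting scores are averaged.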

In [47]:
    # LOOCV (Leave-One-Out Cross-Validation): each observation is used once as the test set.
    from sklearn.model_selection import LeaveOneOut
    scores = cross_val_score(model, X, y, cv=LeaveOneOut())
    # First the individual scores (one per left-out observation), then their mean.
    print(scores, scores.mean(), sep="\n")

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1.
 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1.
 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1.
 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1.]
0.8273615635179153


In [59]:
# Model evaluation of the 5-NN classifier (model2) on a fresh 70/30 hold-out split.
# random_state=np.random means the split, and therefore every number below, changes per run.
X1, X2, y1, y2 = train_test_split(X, y, random_state=np.random, train_size=0.7)
model2.fit(X1, y1)
y2_model = model2.predict(X2)  # predict once and reuse the result for every metric
y_pred = y2_model  # alias kept for any code that refers to the predictions as y_pred
cm1 = confusion_matrix(y2, y2_model)
print("Confusion Matrix: \n", cm1, "\n")
print("Accuracy:", accuracy_score(y2, y2_model))
# The two scores below use different averaging modes, so each line is labelled with its own:
# 'weighted' weights each class by its support, 'macro' is the unweighted mean over classes.
print("Precision (weighted):", precision_score(y2, y2_model, average='weighted'))
print("Recall (macro):", recall_score(y2, y2_model, average='macro'))
print("\nClassification report :\n\n", classification_report(y2, y2_model))

Confusion Matrix: 
 [[49  0  0  0]
 [ 1 18  0  4]
 [ 1  0 15  0]
 [ 0  1  0  4]] 

0.9247311827956989
0.9394398393199951
0.8800271739130434

Accuracy: 0.9247311827956989

Classification report :

               precision    recall  f1-score   support

           0       0.96      1.00      0.98        49
           1       0.95      0.78      0.86        23
           2       1.00      0.94      0.97        16
           3       0.50      0.80      0.62         5

    accuracy                           0.92        93
   macro avg       0.85      0.88      0.86        93
weighted avg       0.94      0.92      0.93        93



In [60]:
# Optimal parameters: grid search over the number of neighbours
from sklearn.model_selection import GridSearchCV

# Create a new, unfitted KNN model for the search
Knn2 = KNeighborsClassifier()
K_range = list(range(1, 32))  # candidate values 1..31
pram_grid = {"n_neighbors": K_range}
# Every candidate is scored by accuracy under 10-fold cross-validation (cv=10).
grid = GridSearchCV(estimator=Knn2, param_grid=pram_grid, cv=10, scoring="accuracy")
grid.fit(X, y)
# Best mean CV score, the parameters that achieved it, and the corresponding estimator.
print(grid.best_score_, grid.best_params_, grid.best_estimator_, sep="\n")
## -> The optimal parameter found with 10-fold cross-validation is n_neighbors=13

0.8726881720430107
{'n_neighbors': 13}
KNeighborsClassifier(n_neighbors=13)


### Model Evaluation according to the four metrics using the three validation methods

|           	| Hold-out 	| CV 5-folds 	| LOOCV 	|
|:-----------:	|:----------:	|:--------------------------:	|:-------:	|
| Accuracy  	| 85%      	| 82% (mean of the 5 folds)	| 83%   	|
| Precision 	| 98%      	|                          	|       	|
| Recall    	| 98%      	|                          	|       	|
| F1-score  	| 98%      	|                          	|       	|

