## Home task: KNN on the Iris dataset

In [1]:
import pandas as pd 
import numpy as np
import math

from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

from collections import Counter

from sklearn.datasets import load_iris
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# np.c_ stacks its arguments column-wise, so the 1-D target vector becomes an
# extra last column next to the feature matrix.
df_iris = pd.DataFrame(
    data=np.c_[iris['data'], iris['target']],
    columns=iris['feature_names'] + ['target'],
)
df_iris.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


DataFrame before shuffle

In [2]:
# Preview the first rows of the frame as loaded (the shuffle happens in a later cell).
df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


DataFrame after shuffle

In [3]:
# Shuffle all rows with a fixed seed so the positional train/test split that
# follows is random but reproducible.
# sort_index() first puts the rows back in their original (default-index)
# order, which makes this cell idempotent: running it again produces the same
# permutation instead of shuffling an already shuffled frame.
df_iris = df_iris.sort_index().sample(frac=1, random_state=3)
df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
47,4.6,3.2,1.4,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
31,5.4,3.4,1.5,0.4,0.0
25,5.0,3.0,1.6,0.2,0.0
15,5.7,4.4,1.5,0.4,0.0


Split into X (features) and y (target)

In [4]:
# Features are every column except the last; the label is the last ('target') column.
X, y = df_iris.iloc[:, :-1], df_iris.iloc[:, -1]

Split the data into train and test sets

In [5]:
# Positional 80/20 split of the rows: the first n_train rows form the training
# set and the remaining rows form the test set.
n_train = math.floor(0.8 * X.shape[0])
# Define the test size as the remainder so that n_train + n_test always equals
# the number of rows (and matches the actual size of X_test below).
n_test = X.shape[0] - n_train

# Slice with .iloc so the split is explicitly positional. After shuffling, the
# integer index labels no longer match row positions, and plain `y[:n_train]`
# on an integer-indexed Series is ambiguous (some pandas versions emit a
# FutureWarning for it).
# The slices are converted to plain NumPy arrays so that later indexing is
# purely positional.
X_train = np.asarray(X.iloc[:n_train])
y_train = np.asarray(y.iloc[:n_train])
X_test = np.asarray(X.iloc[n_train:])
y_test = np.asarray(y.iloc[n_train:])

print("Total Number of rows in train:",X_train.shape[0])
print("Total Number of rows in test:",X_test.shape[0])

Total Number of rows in train: 120
Total Number of rows in test: 30


  y_train = y[:n_train]
  y_test = y[n_train:]


Normalize the Dataset

In [6]:
# NOTE: Normalizer rescales each sample (row) to unit norm independently of all
# other rows, so fitting it learns nothing from the training data; fit is only
# called to follow the usual estimator API. The printed rows in the next cell
# confirm that every normalized row has length 1.
scaler = Normalizer()
normalized_X_train = scaler.fit_transform(X_train)  # row-normalize the training set
normalized_X_test = scaler.transform(X_test)  # apply the same transform to the test set

In [7]:
# Show the first five training rows before and after normalization.
print('X train before Normalization', X_train[:5], sep='\n')
print('\nX train after Normalization', normalized_X_train[:5], sep='\n')

X train before Normalization
[[4.6 3.2 1.4 0.2]
 [4.6 3.1 1.5 0.2]
 [5.4 3.4 1.5 0.4]
 [5.  3.  1.6 0.2]
 [5.7 4.4 1.5 0.4]]

X train after Normalization
[[0.79594782 0.55370283 0.24224499 0.03460643]
 [0.80003025 0.53915082 0.26087943 0.03478392]
 [0.82225028 0.51771314 0.22840286 0.06090743]
 [0.82647451 0.4958847  0.26447184 0.03305898]
 [0.77381111 0.59732787 0.2036345  0.05430253]]


### Custom implementation of KNN

Step 1 (Euclidean Distance)

In [8]:
def distance_ecu(X_train, X_test_point):
    """Euclidean distance from one query point to every training row.

    Parameters
    ----------
    X_train : sequence of rows
        Must support ``len()`` and integer indexing; each row must do the same.
    X_test_point : indexable
        The query point. It is read at the same column positions as each
        training row.

    Returns
    -------
    pandas.DataFrame
        One column named ``'dist'`` with a default integer index, so row ``i``
        holds the distance between ``X_test_point`` and ``X_train[i]``.
    """

    def _distance_to(train_point):
        # Sum the squared per-column differences, then take the square root.
        squared_total = sum(
            (train_point[col] - X_test_point[col]) ** 2
            for col in range(len(train_point))
        )
        return np.sqrt(squared_total)

    distances = [_distance_to(X_train[row]) for row in range(len(X_train))]

    # A DataFrame keeps the original row positions in its index, which the
    # later steps use to look up the neighbours' labels.
    return pd.DataFrame(data=distances, columns=['dist'])

Step 2 (Find the nearest neighbors)

In [9]:
def nearest_neighbors(distance_point, K):
    """Return the K rows of ``distance_point`` with the smallest ``'dist'``.

    Parameters
    ----------
    distance_point : pandas.DataFrame
        Frame with a ``'dist'`` column, as produced by ``distance_ecu``.
    K : int
        Number of neighbours to keep.

    Returns
    -------
    pandas.DataFrame
        The first K rows after an ascending sort on ``'dist'``. The original
        index labels are preserved, so they still identify the training rows.
    """
    ranked = distance_point.sort_values(by='dist')
    return ranked.iloc[:K]

Step 3 (Classify the point based on a majority vote)

In [10]:
def voting(df_nearest, y_train):
    """Predict a label by majority vote among the nearest neighbours.

    Parameters
    ----------
    df_nearest : pandas.DataFrame
        The nearest-neighbour rows. Its index must hold the *positions* of the
        neighbours in the training set, ordered from nearest to farthest.
    y_train : array-like
        Training labels, aligned by position with the training rows. May be a
        NumPy array, a list, or a pandas Series.

    Returns
    -------
    The most frequent label among the neighbours. When several labels are tied,
    the one encountered first (i.e. belonging to the nearest neighbour) wins.

    Raises
    ------
    IndexError
        If ``df_nearest`` is empty, since there is nothing to vote on.
    """
    # Coerce to a NumPy array so the lookup is always positional. Indexing a
    # pandas Series directly with these integers would be label-based and pick
    # the wrong rows whenever the Series index is not 0..n-1 (e.g. after a
    # shuffle), and a plain list could not be indexed this way at all.
    neighbor_labels = np.asarray(y_train)[df_nearest.index.to_numpy()]

    counter_vote = Counter(neighbor_labels)

    # most_common orders equal counts by first appearance.
    y_pred = counter_vote.most_common(1)[0][0]

    return y_pred

Full Algorithm: Putting Everything Together

In [11]:
def KNN_from_scratch(X_train, y_train, X_test, K):
    """Classify every row of ``X_test`` with K-nearest-neighbours.

    For each test point the three steps are chained: compute the distances to
    all training rows, keep the K nearest, and take a majority vote over their
    labels.

    Parameters
    ----------
    X_train : training rows.
    y_train : training labels, aligned by position with ``X_train``.
    X_test : iterable of test points.
    K : number of neighbours that take part in the vote.

    Returns
    -------
    list
        One predicted label per test point, in the order of ``X_test``.
    """
    return [
        voting(nearest_neighbors(distance_ecu(X_train, X_test_point), K), y_train)
        for X_test_point in X_test
    ]

Test the KNN Algorithm on the test dataset

In [12]:
# Number of neighbours used for this first comparison; it is tuned by
# cross-validation further down.
K = 1
y_pred_scratch = KNN_from_scratch(
    X_train=normalized_X_train,
    y_train=y_train,
    X_test=normalized_X_test,
    K=K,
)
print(y_pred_scratch)

[0.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0, 2.0, 0.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 2.0, 2.0, 2.0, 0.0, 0.0, 1.0, 2.0]


Compare our implementation with the scikit-learn library

In [13]:
# Reference classifier with the same K, trained and evaluated on the same
# normalized data as the from-scratch version.
knn = KNeighborsClassifier(n_neighbors=K).fit(normalized_X_train, y_train)
y_pred_sklearn = knn.predict(normalized_X_test)
print(y_pred_sklearn)

[0. 0. 0. 1. 2. 0. 0. 2. 0. 1. 2. 1. 2. 2. 1. 1. 2. 1. 0. 1. 1. 0. 1. 2.
 2. 2. 0. 0. 1. 2.]


Calculate the accuracy of both methods

In [14]:
# Score both prediction vectors against the held-out test labels.
print(
    f'The accuracy of our implementation is {accuracy_score(y_test, y_pred_scratch)}'
)
print(
    f'The accuracy of sklearn implementation is {accuracy_score(y_test, y_pred_sklearn)}'
)

The accuracy of our implementation is 0.9666666666666667
The accuracy of sklearn implementation is 0.9666666666666667


Perform Hyper-parameter Tuning using K-fold Cross Validation

In [15]:
# Hyper-parameter search: for every candidate K, estimate the accuracy with
# K-fold cross-validation on the training set only (the test set is not used).
n_splits = 4  ## Number of cross-validation folds
kf = KFold(n_splits=n_splits)  ## KFold defaults otherwise; rows were shuffled earlier in the notebook

accuracy_k = []  ## Mean validation accuracy for each candidate K
k_values = list(range(1, 32, 2))  ## Candidate K values: the odd numbers 1..31

for k in k_values:  ## Loop over the K values
    fold_scores = []
    for train_idx, valid_idx in kf.split(normalized_X_train):  ## Loop over the splits
        # Rows used to fit this fold ...
        normalized_X_train_fold = normalized_X_train[train_idx]
        y_train_fold = y_train[train_idx]

        # ... and the held-out rows used to validate it.
        normalized_X_valid_fold = normalized_X_train[valid_idx]
        y_valid_fold = y_train[valid_idx]

        y_pred_fold = KNN_from_scratch(
            normalized_X_train_fold, y_train_fold, normalized_X_valid_fold, k
        )

        # accuracy_score expects (y_true, y_pred), in that order.
        fold_scores.append(accuracy_score(y_valid_fold, y_pred_fold))

    # Average over the folds.
    accuracy_k.append(sum(fold_scores) / n_splits)

In [16]:
# Report the cross-validation result as a list of (mean accuracy, K) tuples,
# in the same order as k_values.
print(f'The accuracy for each K value was {list ( zip (accuracy_k, k_values))}') ## pairs each accuracy with its K value

The accuracy for each K value was [(0.9666666666666667, 1), (0.975, 3), (0.9666666666666667, 5), (0.9666666666666667, 7), (0.9666666666666667, 9), (0.9666666666666667, 11), (0.9666666666666667, 13), (0.9666666666666667, 15), (0.9666666666666667, 17), (0.9666666666666667, 19), (0.9583333333333333, 21), (0.9666666666666667, 23), (0.9666666666666667, 25), (0.975, 27), (0.9666666666666667, 29), (0.9666666666666667, 31)]


### Result

In [17]:
# np.argmax returns the position of the first maximum, so when several K values
# share the top accuracy the smallest K wins (k_values is ascending).
score_best = np.max(accuracy_k)  # highest mean cross-validation accuracy
k_best = k_values[np.argmax(accuracy_k)]  # the K that achieved it
print('The best k = {} , score = {}'.format(k_best, score_best))

The best k = 3 , score = 0.975
