# KNN Classifier

## Importing and Loading data

In [39]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

import pandas as pd
import numpy as np

import time

In [2]:
df = pd.read_csv(r'\Users\KAYEG\Documents\ASU\FALL - 21 - Term 7\Machine Learning\Project\Project\data.csv', index_col=0)
df.head()

Unnamed: 0,location,country,gender,age,vis_wuhan,from_wuhan,symptom1,symptom2,symptom3,symptom4,symptom5,symptom6,diff_sym_hos,result
0,104,8,1,66.0,1,0,14,31,19,12,3,1,8,1
1,101,8,0,56.0,0,1,14,31,19,12,3,1,0,0
2,137,8,1,46.0,0,1,14,31,19,12,3,1,13,0
3,116,8,0,60.0,1,0,14,31,19,12,3,1,0,0
4,116,8,1,58.0,0,0,14,31,19,12,3,1,0,0


# Splitting Data

## Features - Label Split

In [3]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [4]:
print(X.shape, y.shape)

(863, 13) (863,)


### Train - Validation - Test Split

In [5]:
# 60% 20% 20% split is applied
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5)
print('\nX Train', x_train.shape, 'Y Train', y_train.shape, '\nX Test', x_test.shape, 'Y Test', y_test.shape,
      '\nX Val', x_val.shape, 'Y Val', y_val.shape)

(517, 13) (346, 13) (517,) (346,)

X Train (517, 13) Y Train (517,) 
X Test (173, 13) Y Test (173,) 
X Val (173, 13) Y Val (173,)


In [6]:
knn_class = KNeighborsClassifier(n_neighbors=3)
knn_class.fit(x_train, y_train)
y_predict = knn_class.predict(x_test)

In [7]:
sk_acc_score = accuracy_score(y_test, y_predict) * 100
sk_acc_score

92.48554913294798

In [8]:
precision_recall_fscore_support(y_test, y_predict)

(array([0.9602649 , 0.68181818]),
 array([0.95394737, 0.71428571]),
 array([0.95709571, 0.69767442]),
 array([152,  21], dtype=int64))

# Implementing KNN

## Concept

- Calculate distance from point to point --> distance between target and point
    - target = [x, y, z]
    - points = [a, b, c]
    
    - squared = [(x - a)^2, (y - b)^2, (z - c)^2]
    - sum_arr = sum(squared) == (x - a)^2 + (y - b)^2 + (z - c)^2
    - distance = sqrt(sum_arr)
    
    
- Repeat with all features --> array of distances


- Sort ascendingly --> sorted array of distances


- Take first K distances --> array of K distances


- Predict based on majority --> prediction 

In [14]:
class KNN_Classifier:
    
    def __init__(self, k, power):
        self.k = k
        self.power = power
    
    
    def save_data(self, features, labels):
        self.features = np.array(features)
        self.labels = np.array(labels)
        
    
    def get_distance_from_point(self, target, point):
        squared_difference = np.power(target - point, self.power)
        sum_squared_difference = np.sum(squared_difference)
        distance = np.sqrt(sum_squared_difference)
        
        return distance
    
    
    def get_distances(self, target):
        target = np.array(target)
        distances = []
        
        for point, label in zip(self.features, self.labels):
            distances.append([self.get_distance_from_point(target, point), label])
        
        return np.array(distances)
            
    
    def majority_vote(self, distances):
        k_nearest_labels = distances[distances[:, 0].argsort()][:self.k, -1]
        unique_labels, frequency = np.unique(k_nearest_labels, return_counts=True)
        max_frequency = np.max(frequency)
        mode = []
        
        for i in range(len(frequency)):
            if (frequency[i] == max_frequency):
                mode.append(unique_labels[i])

        return np.array(mode)

    
    
    def predict(self, targets):
        targets = np.array(targets)
        predictions = []
        
        for i, target in enumerate(targets):
            distances = self.get_distances(target)
            prediction = self.majority_vote(distances)
            predictions.append(prediction)
        
        return np.array(predictions)
        
        

In [18]:
knn = KNN_Classifier(3, 2)
knn.save_data(x_train, y_train)
# print(x_train[170:175], '\n', y_train[170:175])
# print(knn.features, '\n', knn.labels)
# print(x_test[:5])
my_predictions = knn.predict(x_test)
my_acc_score = accuracy_score(y_test, my_predictions) * 100
my_acc_score

91.90751445086705

In [16]:
sk_acc_score - my_acc_score

0.5780346820809257

In [52]:
acc = []
maxx = 0
max_k = -1
start = time.time()
for k in range(1, 100, 2):
    knn_class = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
    knn_class.fit(x_train, y_train)
    y_predict = knn_class.predict(x_test)
    
    sk_acc_score = accuracy_score(y_test, y_predict) * 100
    acc.append(sk_acc_score)
    if(sk_acc_score > maxx):
        maxx = sk_acc_score
        max_k = k
end = time.time()
print(f"Execution time {end - start}")

Execution time 0.3508121967315674


In [47]:
max_k

9

In [48]:
acc

[91.90751445086705,
 92.48554913294798,
 92.48554913294798,
 92.48554913294798,
 93.64161849710982,
 92.48554913294798,
 92.48554913294798,
 93.64161849710982,
 93.0635838150289,
 93.0635838150289,
 93.64161849710982,
 93.0635838150289,
 93.0635838150289,
 92.48554913294798,
 92.48554913294798,
 92.48554913294798,
 92.48554913294798,
 91.90751445086705,
 92.48554913294798,
 92.48554913294798,
 90.7514450867052,
 89.59537572254335,
 89.01734104046243,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057]

In [51]:
acc = []
maxx = 0
max_k = -1

start = time.time()

for k in range(1, 100, 2):
    knn = KNN_Classifier(k, 2)
    knn.save_data(x_train, y_train)
    my_predictions = knn.predict(x_test)
    my_acc_score = accuracy_score(y_test, my_predictions) * 100
    acc.append(my_acc_score)
    if(my_acc_score > maxx):
        maxx = my_acc_score
        max_k = k

end = time.time()
print(f"Execution time {end - start}")

Execution time 41.98218035697937


In [41]:
max_k

9

In [42]:
acc

[91.90751445086705,
 91.90751445086705,
 92.48554913294798,
 92.48554913294798,
 93.64161849710982,
 92.48554913294798,
 92.48554913294798,
 93.64161849710982,
 93.0635838150289,
 93.0635838150289,
 93.64161849710982,
 93.0635838150289,
 93.0635838150289,
 92.48554913294798,
 92.48554913294798,
 92.48554913294798,
 92.48554913294798,
 91.90751445086705,
 92.48554913294798,
 92.48554913294798,
 91.32947976878613,
 89.59537572254335,
 89.01734104046243,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057,
 87.86127167630057]