# K Nearest Neighbour Classifier

## 1. Perform the KNN on Sonar dataset with and without in-built functions for K=1, 3, 5, 7.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
import scipy.spatial

In [2]:
df = pd.read_csv("Data/sonar.csv")
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,Result
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [3]:
X = df[[column for column in df.columns if column != "Result"]]
y = df["Result"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20, test_size=0.2, shuffle=True)

### Using Built In functions

In [4]:
clf_1 = KNeighborsClassifier(n_neighbors=1, weights="distance")
clf_3 = KNeighborsClassifier(n_neighbors=3, weights="distance")
clf_5 = KNeighborsClassifier(n_neighbors=5, weights="distance")
clf_7 = KNeighborsClassifier(n_neighbors=7, weights="distance")
cv_results = cross_validate(clf_1, X, y, cv=3)
print("Test results for when K = 1: ", cv_results["test_score"].mean())
cv_results = cross_validate(clf_3, X, y, cv=3)
print("Test results for when K = 3: ", cv_results["test_score"].mean())
cv_results = cross_validate(clf_5, X, y, cv=3)
print("Test results for when K = 5: ", cv_results["test_score"].mean())
cv_results = cross_validate(clf_7, X, y, cv=3)
print("Test results for when K = 7: ", cv_results["test_score"].mean())

Test results for when K = 1:  0.4612836438923395
Test results for when K = 3:  0.48530020703933746
Test results for when K = 5:  0.4759144237405107
Test results for when K = 7:  0.45659075224292617


### Without using built in functions

In [5]:
class KNN:
    def __init__(self, k):
        self.k = k
        
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        
    def distance(self, X1, X2):
        distance = scipy.spatial.distance.euclidean(X1, X2)
    
    def predict(self, X_test):
        final_output = []
        for i in range(len(X_test)):
            d = []
            outputs = []
            for j in range(len(X_train)):
                dist = scipy.spatial.distance.euclidean(X_train.iloc[j] , X_test.iloc[i])
                d.append([dist, j])
            d.sort()
            d = d[0:self.k]
            for d, j in d:
                outputs.append(y_train.iloc[j])
            results = []
            ans = max(set(outputs), key = outputs.count)
            final_output.append(ans)
            
        return final_output
    
    def score(self, X_test, y_test):
        predictions = self.predict(X_test)
        return (predictions == y_test).sum() / len(y_test)

In [6]:
clf_1 = KNN(1)
clf_1.fit(X_train, y_train)
print("Accuracy score when K = 1: ", clf_1.score(X_test, y_test))
clf_3 = KNN(3)
clf_3.fit(X_train, y_train)
print("Accuracy score when K = 3: ", clf_3.score(X_test, y_test))
clf_5 = KNN(5)
clf_5.fit(X_train, y_train)
print("Accuracy score when K = 5: ", clf_5.score(X_test, y_test))
clf_7 = KNN(7)
clf_7.fit(X_train, y_train)
print("Accuracy score when K = 7: ", clf_7.score(X_test, y_test))

Accuracy score when K = 1:  0.7857142857142857
Accuracy score when K = 3:  0.7857142857142857
Accuracy score when K = 5:  0.7142857142857143
Accuracy score when K = 7:  0.6666666666666666


## 2. Break the 60 features into 6 subsets having 10 features each. Perform the KNN and then compare the performance.

In [7]:
x1 = X["0"] + X["1"] + X["2"] + X["3"] + X["4"] + X["5"] + X["6"] + X["7"] + X["8"] + X["9"]
x2 = X["10"] + X["11"] + X["12"] + X["13"] + X["14"] + X["15"] + X["16"] + X["17"] + X["18"] + X["19"]
x3 = X["20"] + X["21"] + X["22"] + X["23"] + X["24"] + X["25"] + X["26"] + X["27"] + X["28"] + X["29"]
x4 = X["30"] + X["31"] + X["32"] + X["33"] + X["34"] + X["35"] + X["36"] + X["37"] + X["38"] + X["39"]
x5 = X["40"] + X["41"] + X["42"] + X["43"] + X["44"] + X["45"] + X["46"] + X["47"] + X["48"] + X["49"]
x6 = X["50"] + X["51"] + X["52"] + X["53"] + X["54"] + X["55"] + X["56"] + X["57"] + X["58"] + X["59"]
X_new = np.array([x1, x2, x3, x4, x5, x6]).T
X_new = pd.DataFrame(X_new)
X_new.head()

Unnamed: 0,0,1,2,3,4,5
0,1.1506,2.4981,5.969,5.4099,1.7553,0.1108
1,1.812,7.781,3.6074,2.525,0.7852,0.0916
2,2.4274,6.9525,5.8549,4.8369,1.62,0.1603
3,0.5908,2.0185,3.8732,6.3428,3.0694,0.0957
4,1.5241,4.6516,5.3996,3.3635,0.9049,0.0792


In [8]:
clf_1 = KNeighborsClassifier(n_neighbors=1, weights="distance")
clf_3 = KNeighborsClassifier(n_neighbors=3, weights="distance")
clf_5 = KNeighborsClassifier(n_neighbors=5, weights="distance")
clf_7 = KNeighborsClassifier(n_neighbors=7, weights="distance")
cv_results = cross_validate(clf_1, X_new, y, cv=3)
print("Test results for when K = 1: ", cv_results["test_score"].mean())
cv_results = cross_validate(clf_3, X_new, y, cv=3)
print("Test results for when K = 3: ", cv_results["test_score"].mean())
cv_results = cross_validate(clf_5, X_new, y, cv=3)
print("Test results for when K = 5: ", cv_results["test_score"].mean())
cv_results = cross_validate(clf_7, X_new, y, cv=3)
print("Test results for when K = 7: ", cv_results["test_score"].mean())

Test results for when K = 1:  0.485576259489303
Test results for when K = 3:  0.4761214630779848
Test results for when K = 5:  0.45721187025534854
Test results for when K = 7:  0.45231193926846097


> **Comparison**

By adding consecutive 10 features each into a new feature, increased accuracy in some cases and also decreased in some cases so we can say that sometimes it maybe useful sometimes it will bias the values.

## 3. Divide them into 6 subsets and perform the classification on each subset. Then perform the majority voting for classification.

In [9]:
feature_sets = [x1, x2, x3, x4, x5, x6]
for i in range(8):
    if(i%2!=0):
        comps = []
        for index, feature_set in enumerate(feature_sets):
            clf = KNeighborsClassifier(n_neighbors=i, weights="distance")
            cv_results = cross_validate(clf, pd.DataFrame(feature_set), y, cv=3)
            score = cv_results["test_score"].mean()
            print(f"Test results for when K = {i} and for feature {index + 1} : ", score)
            comps.append([index, score])
        maxima = max(comps, key = lambda x: x[1])
        print(f"Majority voting is given for feature: {maxima[0] + 1}, and score is: {maxima[1]}")

Test results for when K = 1 and for feature 1 :  0.6008971704623879
Test results for when K = 1 and for feature 2 :  0.4954451345755693
Test results for when K = 1 and for feature 3 :  0.4707384403036577
Test results for when K = 1 and for feature 4 :  0.4562456866804692
Test results for when K = 1 and for feature 5 :  0.5049689440993789
Test results for when K = 1 and for feature 6 :  0.45224292615596956
Majority voting is given for feature: 1, and score is: 0.6008971704623879
Test results for when K = 3 and for feature 1 :  0.6345065562456867
Test results for when K = 3 and for feature 2 :  0.49558316080055215
Test results for when K = 3 and for feature 3 :  0.4707384403036577
Test results for when K = 3 and for feature 4 :  0.4612836438923395
Test results for when K = 3 and for feature 5 :  0.5193926846100759
Test results for when K = 3 and for feature 6 :  0.456935817805383
Majority voting is given for feature: 1, and score is: 0.6345065562456867
Test results for when K = 5 and for

## 4.  Also, perform the same tasks for 5 subsets.

In [10]:
x1 = X["0"] + X["1"] + X["2"] + X["3"] + X["4"] + X["5"] + X["6"] + X["7"] + X["8"] + X["9"] + X["10"] + X["11"]
x2 = X["12"] + X["13"] + X["14"] + X["15"] + X["16"] + X["17"] + X["18"] + X["19"] + X["20"] + X["21"] + X["22"] + X["23"]
x3 = X["24"] + X["25"] + X["26"] + X["27"] + X["28"] + X["29"] + X["30"] + X["31"] + X["32"] + X["33"] + X["34"] + X["35"]
x4 = X["36"] + X["37"] + X["38"] + X["39"] + X["40"] + X["41"] + X["42"] + X["43"] + X["44"] + X["45"] + X["46"] + X["47"]
x5 = X["48"] + X["49"] + X["50"] + X["51"] + X["52"] + X["53"] + X["54"] + X["55"] + X["56"] + X["57"] + X["58"] + X["59"]
X_new = np.array([x1, x2, x3, x4, x5]).T
X_new = pd.DataFrame(X_new)
X_new.head()

Unnamed: 0,0,1,2,3,4
0,1.4697,4.2522,7.2581,3.7322,0.1815
1,2.959,8.3475,3.5487,1.6084,0.1386
2,3.7667,7.8784,6.13,3.893,0.1839
3,0.8781,3.4145,5.2081,6.2965,0.1932
4,2.3345,5.7981,5.4983,2.1852,0.1068


> **KNN and Comparison**

In [11]:
clf_1 = KNeighborsClassifier(n_neighbors=1, weights="distance")
clf_3 = KNeighborsClassifier(n_neighbors=3, weights="distance")
clf_5 = KNeighborsClassifier(n_neighbors=5, weights="distance")
clf_7 = KNeighborsClassifier(n_neighbors=7, weights="distance")
cv_results = cross_validate(clf_1, X_new, y, cv=3)
print("Test results for when K = 1: ", cv_results["test_score"].mean())
cv_results = cross_validate(clf_3, X_new, y, cv=3)
print("Test results for when K = 3: ", cv_results["test_score"].mean())
cv_results = cross_validate(clf_5, X_new, y, cv=3)
print("Test results for when K = 5: ", cv_results["test_score"].mean())
cv_results = cross_validate(clf_7, X_new, y, cv=3)
print("Test results for when K = 7: ", cv_results["test_score"].mean())

Test results for when K = 1:  0.39420289855072466
Test results for when K = 3:  0.39910282953761217
Test results for when K = 5:  0.394271911663216
Test results for when K = 7:  0.3800552104899931


> **Comparison**

By adding consecutive 12 features each into a new feature and forming 5 subsets, accuracy score is decreased when compared to 10 features and 6 subsets.

> **Majority voting for classification**

In [12]:
feature_sets = [x1, x2, x3, x4, x5]
for i in range(8):
    if(i%2!=0):
        comps = []
        for index, feature_set in enumerate(feature_sets):
            clf = KNeighborsClassifier(n_neighbors=i, weights="distance")
            cv_results = cross_validate(clf, pd.DataFrame(feature_set), y, cv=3)
            score = cv_results["test_score"].mean()
            print(f"Test results for when K = {i} and for feature {index + 1} : ", score)
            comps.append([index, score])
        maxima = max(comps, key = lambda x: x[1])
        print(f"Majority voting is given for feature: {maxima[0] + 1}, and score is: {maxima[1]}")

Test results for when K = 1 and for feature 1 :  0.6253968253968254
Test results for when K = 1 and for feature 2 :  0.3510006901311249
Test results for when K = 1 and for feature 3 :  0.5336093857832989
Test results for when K = 1 and for feature 4 :  0.4810904071773637
Test results for when K = 1 and for feature 5 :  0.5429261559696342
Majority voting is given for feature: 1, and score is: 0.6253968253968254
Test results for when K = 3 and for feature 1 :  0.6398895790200138
Test results for when K = 3 and for feature 2 :  0.37991718426501037
Test results for when K = 3 and for feature 3 :  0.5674258109040718
Test results for when K = 3 and for feature 4 :  0.48102139406487227
Test results for when K = 3 and for feature 5 :  0.5574189095928227
Majority voting is given for feature: 1, and score is: 0.6398895790200138
Test results for when K = 5 and for feature 1 :  0.6447895100069013
Test results for when K = 5 and for feature 2 :  0.3751552795031056
Test results for when K = 5 and fo