## K Nearest Neighbours 

In [3]:
import numpy as np

#### _*The code below is my implementation of KNN. I will run it on both datasets (Iris and Ionosphere) and compute the accuracy with K values 1, 3 and 5, to find the best of these three values.*_

In [4]:
class KNearestNeighbour:
    """Minimal k-nearest-neighbours classifier based on Euclidean distance.

    Usage mirrors scikit-learn: construct with ``k``, call ``fit`` with the
    training samples and labels, then call ``pred`` to label new samples.
    """

    def __init__(self, k):
        """Store the number of neighbours that vote on each prediction.

        Parameters
        ----------
        k : int
            Number of nearest neighbours to consult (e.g. 1, 3 or 5).

        Raises
        ------
        TypeError
            If ``k`` is not an integer (it is later used as a slice bound).
        ValueError
            If ``k`` is smaller than 1; with no neighbours there is nothing
            to vote on.
        """
        if not isinstance(k, (int, np.integer)):
            raise TypeError("k must be an integer, got " + type(k).__name__)
        if k < 1:
            raise ValueError("k must be at least 1, got " + str(k))
        self.k = k

    def fit(self, X, y):
        """Memorise the training data (same role as ``fit`` in scikit-learn).

        Inputs are converted with ``np.asarray`` so that lists and pandas
        objects are indexed by position, exactly like NumPy arrays.

        Parameters
        ----------
        X : array-like
            Training samples, one per row.
        y : array-like
            Training labels; ``y[n]`` is the label of ``X[n]``.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) == 0:
            raise ValueError("X must contain at least one training sample")
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same length, got "
                + str(len(X)) + " and " + str(len(y))
            )
        self.X_train = X
        self.y_train = y

    def euclidean_dist(self, x1, x2):
        """Return the Euclidean distance between the points ``x1`` and ``x2``."""
        difference = x1 - x2
        return np.sqrt(np.sum(difference ** 2))

    def populate(self, X_train, X, i):
        """Return the distances from sample ``X[i]`` to every training sample.

        Parameters
        ----------
        X_train : array-like or None
            Training samples to measure against. ``None`` falls back to the
            data stored by ``fit``.
        X : array-like
            Samples being classified.
        i : int
            Position in ``X`` of the sample of interest.

        Returns
        -------
        list
            One distance per row of ``X_train``, in training order (the
            caller sorts them).
        """
        if X_train is None:
            X_train = self.X_train
        query = X[i]
        return [self.euclidean_dist(row, query) for row in X_train]

    def neighbor_counter(self, k_neighbour):
        """Count how often each label occurs among the given neighbours.

        Parameters
        ----------
        k_neighbour : iterable of int
            Positions of the selected neighbours in the training data.

        Returns
        -------
        dict
            Maps label -> number of votes. Keys are inserted in the order the
            labels are first encountered.
        """
        count_dict = {}
        for index in k_neighbour:
            label = self.y_train[index]
            count_dict[label] = count_dict.get(label, 0) + 1
        return count_dict

    def pred(self, X):
        """Predict a label for every sample in ``X``.

        For each sample the ``k`` closest training samples are selected and
        the most frequent label among them wins. When several labels tie, the
        one encountered first among the sorted neighbours is returned. If
        ``k`` exceeds the number of training samples, all of them vote.

        Parameters
        ----------
        X : array-like
            Samples to classify, one per row.

        Returns
        -------
        list
            Predicted labels, in the same order as the rows of ``X``.
        """
        X = np.asarray(X)
        predictions = []

        for i in range(len(X)):
            euclidean_distances = self.populate(self.X_train, X, i)

            # Sort the distances (ascending) and keep only the k nearest.
            k_neighbour = np.array(euclidean_distances).argsort()[: self.k]

            count = self.neighbor_counter(k_neighbour)
            predictions.append(max(count, key=count.get))

        return predictions

### First dataset:

In [5]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# The iris dataset is bundled with scikit-learn.
iris = load_iris()

# The ionosphere dataset is read from a comma-separated text file.
ionosphere = np.genfromtxt("ionosphere.txt", delimiter=",")

In [6]:
# Split iris into training and test sets. The random state is the birthday
# in DDMM format, as specified on the assignment sheet.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=2111,
)

### K = 1
### _This means the model made the correct prediction for 92% of the irises in the test set._

### Test error rate is **1 - Accuracy**

In [7]:
# Fit a 1-nearest-neighbour classifier on the current training split.
KNN = KNearestNeighbour(k = 1)
KNN.fit(X_train, y_train)

# Predict once and reuse the result; `predict` stays available as an alias
# because later cells print from it.
y_pred = KNN.pred(X_test)
predict = y_pred

# Accuracy is the fraction of test samples whose prediction equals the true label.
accuracy = np.mean(np.asarray(y_pred) == y_test)

print("Test set accuracy for k = 1 is " + str(accuracy))
test_err_rate = 1 - accuracy
# Rounded to 3 decimal places
print("Test error rate for k = 1 is " + str(round(test_err_rate, 3)))

Test set accuracy for k = 1 is 0.9210526315789473
Test error rate for k = 1 is 0.079


### K = 3
### _This means the model made the correct prediction for about 94.7% of the irises in the test set._

### Test error rate is **1 - Accuracy**

In [8]:
# Fit a 3-nearest-neighbour classifier on the current training split.
KNN = KNearestNeighbour(k = 3)
KNN.fit(X_train, y_train)

# Predict once and reuse the result; `predict` stays available as an alias
# because later cells print from it.
y_pred = KNN.pred(X_test)
predict = y_pred

# Accuracy is the fraction of test samples whose prediction equals the true label.
accuracy = np.mean(np.asarray(y_pred) == y_test)

print("Test set accuracy for k = 3 is " + str(accuracy))
test_err_rate = 1 - accuracy
# Rounded to 3 decimal places
print("Test error rate for k = 3 is " + str(round(test_err_rate, 3)))

Test set accuracy for k = 3 is 0.9473684210526315
Test error rate for k = 3 is 0.053


### K = 5
### _This means the model made the correct prediction for about 94.7% of the irises in the test set._

### Test error rate is **1 - Accuracy**

In [9]:
# Fit a 5-nearest-neighbour classifier on the current training split.
KNN = KNearestNeighbour(k = 5)
KNN.fit(X_train, y_train)

# Predict once and reuse the result; `predict` stays available as an alias
# because the comparison cell further down prints from it.
y_pred = KNN.pred(X_test)
predict = y_pred

# Accuracy is the fraction of test samples whose prediction equals the true label.
accuracy = np.mean(np.asarray(y_pred) == y_test)

print("Test set accuracy for k = 5 is " + str(accuracy))
test_err_rate = 1 - accuracy
# Rounded to 3 decimal places
print("Test error rate for k = 5 is " + str(round(test_err_rate, 3)))

Test set accuracy for k = 5 is 0.9473684210526315
Test error rate for k = 5 is 0.053


#### Important information from the cells above:
#### _K values 3 and 5 are the best since they have the lowest error rate_

### Calculating the predicted labels for all test samples and comparing them with the true labels for the test samples:

#### K = 5

In [10]:
# Not required by the assignment sheet, but handy for eyeballing the results:
# print each prediction next to the true label at the same position.
for i, predicted_label in enumerate(predict):
    true_label = y_test[i]
    print("Predicted: " + str(predicted_label) + " True label: " + str(true_label))

Predicted: 1 True label: 1
Predicted: 2 True label: 2
Predicted: 0 True label: 0
Predicted: 0 True label: 0
Predicted: 0 True label: 0
Predicted: 0 True label: 0
Predicted: 1 True label: 1
Predicted: 2 True label: 2
Predicted: 2 True label: 1
Predicted: 2 True label: 2
Predicted: 0 True label: 0
Predicted: 1 True label: 1
Predicted: 0 True label: 0
Predicted: 0 True label: 0
Predicted: 2 True label: 2
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: 0 True label: 0
Predicted: 0 True label: 0
Predicted: 2 True label: 2
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: 0 True label: 0
Predicted: 0 True label: 0
Predicted: 2 True label: 2
Predicted: 1 True label: 1
Predicted: 2 True label: 1
Predicted: 0 True label: 0
Predicted: 0 True label: 0
Predicted: 0 True label: 0
Predicted: 2 True label: 2
Predicted: 0 True label: 0
Predicted: 2 True label: 2
Predicted: 2 True label: 2
Predicted: 1 True label: 1
Predicted: 2 True label: 2
Predicted: 1 True label: 1
P

### Second dataset:

In [11]:
# Columns 0-33 of the file are read as the feature matrix.
X = np.genfromtxt(
    "ionosphere.txt",
    delimiter=",",
    usecols=np.arange(34),
)
# Column 34 is read as the integer label vector.
y = np.genfromtxt(
    "ionosphere.txt",
    delimiter=",",
    usecols=34,
    dtype=int,
)

In [12]:
# Split the ionosphere data into training and test sets.
# NOTE(review): the iris split uses random_state=2111 while this one uses
# 2102 — confirm which seed is the intended one.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2102,
)

### K = 1
### _This means the model made the correct prediction about 89.8% of the time._

### Test error rate is **1 - Accuracy**

In [13]:
# Fit a 1-nearest-neighbour classifier on the current training split.
KNN = KNearestNeighbour(k = 1)
KNN.fit(X_train, y_train)

# Predict once and reuse the result; `predict` stays available as an alias
# because later cells print from it.
y_pred = KNN.pred(X_test)
predict = y_pred

# Accuracy is the fraction of test samples whose prediction equals the true label.
accuracy = np.mean(np.asarray(y_pred) == y_test)

print("Test set accuracy for k = 1 is " + str(accuracy))
test_err_rate = 1 - accuracy
# Rounded to 3 decimal places
print("Test error rate for k = 1 is " + str(round(test_err_rate, 3)))

Test set accuracy for k = 1 is 0.8977272727272727
Test error rate for k = 1 is 0.102


### K = 3
### _This means the model made the correct prediction about 90.9% of the time._

### Test error rate is **1 - Accuracy**

In [14]:
# Fit a 3-nearest-neighbour classifier on the current training split.
KNN = KNearestNeighbour(k = 3)
KNN.fit(X_train, y_train)

# Predict once and reuse the result; `predict` stays available as an alias
# because later cells print from it.
y_pred = KNN.pred(X_test)
predict = y_pred

# Accuracy is the fraction of test samples whose prediction equals the true label.
accuracy = np.mean(np.asarray(y_pred) == y_test)

print("Test set accuracy for k = 3 is " + str(accuracy))
test_err_rate = 1 - accuracy
# Rounded to 3 decimal places
print("Test error rate for k = 3 is " + str(round(test_err_rate, 3)))

Test set accuracy for k = 3 is 0.9090909090909091
Test error rate for k = 3 is 0.091


### K = 5
### _This means the model made the correct prediction 87.5% of the time._

### Test error rate is **1 - Accuracy**

In [15]:
# Fit a 5-nearest-neighbour classifier on the current training split.
KNN = KNearestNeighbour(k = 5)
KNN.fit(X_train, y_train)

# Predict once and reuse the result; `predict` stays available as an alias
# because the comparison cell further down prints from it.
y_pred = KNN.pred(X_test)
predict = y_pred

# Accuracy is the fraction of test samples whose prediction equals the true label.
accuracy = np.mean(np.asarray(y_pred) == y_test)

print("Test set accuracy for k = 5 is " + str(accuracy))
test_err_rate = 1 - accuracy
# Rounded to 3 decimal places
print("Test error rate for k = 5 is " + str(round(test_err_rate, 3)))

Test set accuracy for k = 5 is 0.875
Test error rate for k = 5 is 0.125


#### _K value 3 is our best K because it has the lowest error rate, in other words, the highest accuracy._

### Calculating the predicted labels for all test samples and comparing them with the true labels for the test samples:

#### K = 5

In [16]:
# Print each prediction next to the true label at the same position.
for i, predicted_label in enumerate(predict):
    true_label = y_test[i]
    print("Predicted: " + str(predicted_label) + " True label: " + str(true_label))

Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: -1 True label: -1
Predicted: 1 True label: -1
Predicted: 1 True label: -1
Predicted: -1 True label: -1
Predicted: 1 True label: -1
Predicted: -1 True label: -1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: -1 True label: -1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: -1 True label: -1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: 1 True label: -1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: -1 True label: -1
Predicted: 1 True label: 1
Predicted: -1 True label: -1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: -1 True label: -1
Predicted: -1 True label: -1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predicted: 1 True label: 1
Predic