# K Nearest Neighbors
Implementation from scratch
*Dataset link: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/*

#### Importing packages

In [1]:
import numpy as np 
import pandas as pd 
import warnings
import random
from math import sqrt
from collections import Counter

#### Function implementing the K Nearest Neighbors algorithm

In [13]:
def k_nearest_neighbors(data, predict, k=3):
    """Classify ``predict`` by majority vote among its k nearest training points.

    Parameters
    ----------
    data : dict
        Maps each group label to a list of feature vectors belonging to
        that group.
    predict : sequence of numbers
        Feature vector to classify; must have the same length as the
        training feature vectors.
    k : int, default 3
        Number of nearest neighbours allowed to vote.

    Returns
    -------
    tuple
        ``(vote_result, confidence)`` where ``vote_result`` is the winning
        group label and ``confidence`` is the fraction of the votes cast
        that went to that label.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``data`` holds no training points.

    Warns
    -----
    UserWarning
        If the number of groups is not smaller than ``k``.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if len(data) >= k:
        # len(data) is the number of groups (dict keys), not a feature dimension
        warnings.warn("K is not larger than the number of groups, so the vote can tie")

    # the query point never changes, so convert it once outside the loops
    target = np.array(predict)

    # calculating the distance from prediction point to all points
    distances = []
    for group in data:
        for features in data[group]:
            # this is a high level version of calculating the euclidean distance
            euclidean_distance = np.linalg.norm(np.array(features) - target)
            distances.append([euclidean_distance, group])

    if not distances:
        raise ValueError("data must contain at least one training point")

    # finding the closest k points (fewer when less than k points exist)
    votes = [i[1] for i in sorted(distances)[:k]]
    # finding the most common group in the closest k points and its vote count
    vote_result, vote_count = Counter(votes).most_common(1)[0]
    # confidence is relative to the votes actually cast, which equals k
    # whenever at least k training points are available
    confidence = vote_count / len(votes)
    return vote_result, confidence

#### Reading the dataset

In [14]:
# Load the raw file into a DataFrame.
# NOTE(review): a later cell drops a column named 'id', so the file is
# presumably expected to start with a header row -- confirm for your copy.
dataset = pd.read_csv(filepath_or_buffer='breast-cancer-wisconsin.data')

#### Replacing the missing data

In [15]:
# Swap every '?' placeholder for the numeric sentinel -99999 so the affected
# rows stay in the dataset and can be cast to float later on.
dataset = dataset.replace(to_replace='?', value=-99999)

#### Dropping the id column because it is not useful for prediction

In [16]:
# Remove the 'id' column. The axis is selected through the `columns` keyword
# because passing it positionally (`drop(['id'], 1)`) is rejected by
# pandas 2.x.
dataset = dataset.drop(columns=['id'])

#### Converting the string data to float

In [17]:
# Cast every column to float, then unwrap the frame into a plain list of rows
# (one inner list per record). From here on `dataset` is a list, not a
# DataFrame.
dataset = dataset.astype(float).to_numpy().tolist()

#### Shuffling the dataset

In [18]:
# Shuffle the rows in place with a dedicated, seeded generator so that the
# train/test split (and therefore the reported accuracy) is identical on every
# Restart & Run All. Change the seed to draw a different split.
random.Random(42).shuffle(dataset)

#### Splitting the dataset for training and testing

In [19]:
# Fraction of the rows held out for testing.
test_size = 0.2
# Index separating the two partitions: the first `split_index` rows form the
# test set, everything after them forms the training set.
split_index = int(test_size*len(dataset))
test_data = dataset[:split_index]
train_data = dataset[split_index:]

#### Grouping the data

In [20]:
# Bucket the feature vectors by label (the last value of each row). The two
# labels handled here are 2 and 4; any other label raises a KeyError.
train_set = {2: [], 4: []}
test_set = {2: [], 4: []}
for rows, grouped in ((train_data, train_set), (test_data, test_set)):
    for row in rows:
        # split the row into its features and its trailing label
        *features, label = row
        grouped[label].append(features)

#### Testing with test data and calculating accuracy

In [21]:
# Classify every held-out sample with k = 5 and count how often the predicted
# group matches the group the sample was filed under.
correct = 0
total = 0
for group, samples in test_set.items():
    for sample in samples:
        vote, confidence = k_nearest_neighbors(train_set, sample, 5)
        total += 1
        if vote == group:
            correct += 1

# share of test samples classified correctly
accuracy = correct/total
print(accuracy)

0.9784172661870504
