In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
from collections import Counter
import warnings
from itertools import chain
style.use('ggplot')
%matplotlib inline

In [2]:
class KNN:
    
    def __init__(self, k=3, distance_metric="Euclidean"):
        self.k = k
        self.distance_metric = "Euclidean"
        
        
    def __distance(self, x, y):
        if(self.distance_metric=="Euclidean"):
            return np.linalg.norm( (x-y), 2 )
        elif(self.distance_metric == "Manhattan"):
            return np.linalg.norm((x-y), 1)
        else:
            print("Please enter a valid distance metric")
            return -99999
        
        
    def __calc_all_dist(self, train_data, point):
        dists = []
        [dists.append( [self.__distance(i[:-1], point) , i[-1]] ) for i in train_data]
        return dists
        
        
    def __calc_votes(self, train_data, point):
        distances = self.__calc_all_dist(train_data, point)
        votes = []
        [votes.append(i[1]) for i in sorted(distances)[:self.k]]
        return Counter(votes).most_common(1)[0][0] # We want to find the (1st) most common category in this list containing the k nearest neighbors
        
    
    def predict(self, train_data, test_data, test_size=0.2): # The data to be used for calculating the distance metric and the data whose category is to be predicted
        if len(np.unique(train_data[:, -1])) > self.k:
            warnings.warn("Hey! The number of categories is greater than k. Please increase the value of k")
        result = []
        [result.append(self.__calc_votes(train_data, point)) for point in test_data[:, :-1]]
        return result
    
    def confidence(self, train_data, point, correct_class):
        distances = self.__calc_all_dist(train_data, point)
        votes = []
        [votes.append(i[1]) for i in sorted(distances)[:self.k]]
        return 1 - ( votes.count(correct_class)/len(votes) )
        
    
    def score(self, train_data, test_data):
        result = self.predict(train_data, test_data)
        correct = 0
        for i in range(len(result)):
            if result[i] == test_data[i, -1]:
                correct += 1
            else:
                print(f"Confidence of the incorrect prediction number {i} = {self.confidence(test_data, test_data[i, :-1], test_data[i, -1])}")
        return correct/len(test_data)
    
#     def optimal_k(self, full_data, cross_folds=3, test_size=0.2):
        
#         scores = []
#         maxi = 3
#         for k in range(3, 40, 5):
#             self.k = k
#             folds_scores = np.empty(shape=(cross_folds))
            
#             for i in range(cross_folds):
#                 np.random.shuffle(full_data)
#                 train_data = full_data[:-int(len(full_data)*test_size), :]
#                 test_data = full_data[-int(len(full_data)*test_size):, :]
#                 folds_scores = np.concatenate((folds_scores, self.score(train_data, test_data)), axis=None)
                
#             new_score = folds_scores.mean()
#             scores.append(new_score)
#             if new_score == max(scores):
#                 maxi = k
        
#         return maxi

### Applying our model on real-data

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("breast-cancer-wisconsin.data", header=None, na_values='?').drop(0, axis=1)

In [5]:
df.columns = ["clump_thick", "unif_cell_size", "unif_cell_shape", "marg_adh", "single_epi_cell_size", "bare_nuc", "bland_chr", "normal_nucleo", "mitosis", "class"]

In [6]:
df.fillna(np.floor(df['bare_nuc'].mean()), inplace=True)

In [7]:
df.head()

Unnamed: 0,clump_thick,unif_cell_size,unif_cell_shape,marg_adh,single_epi_cell_size,bare_nuc,bland_chr,normal_nucleo,mitosis,class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2


In [8]:
wisc_df = df.sample(frac=1, random_state=12)

In [9]:
data = wisc_df.to_numpy()
data

array([[ 1.,  1.,  1., ...,  1.,  1.,  2.],
       [ 2.,  1.,  1., ...,  1.,  1.,  2.],
       [ 4.,  1.,  1., ...,  1.,  1.,  2.],
       ...,
       [ 6., 10., 10., ...,  3.,  3.,  4.],
       [ 1.,  1.,  1., ...,  1.,  1.,  2.],
       [ 3.,  1.,  1., ...,  1.,  1.,  2.]])

In [10]:
test_size = 0.2
train_data = data[:-int(len(data)*test_size), :]
test_data = data[-int(len(data)*test_size):, :]

In [11]:
model = KNN(k=5)

In [12]:
result = model.predict(train_data, test_data)

In [13]:
model.score(train_data, test_data)

Confidence of the incorrect prediction number 0 = 0.4
Confidence of the incorrect prediction number 133 = 0.8


0.9856115107913669