In [1]:
import numpy as np
from collections import Counter
import pandas as pd

def minkowski_dist(x1,x2,p):
    """Return the Minkowski distance of order ``p`` between two points.

    The result is rounded to 2 decimal places, so two distances that differ
    by less than 0.005 can compare equal.

    Parameters
    ----------
    x1, x2 : array-like of numbers
        The two points. They are converted to float arrays, so lists and
        tuples are accepted as well as NumPy arrays.
    p : float
        Order of the distance; must be positive. ``p=1`` is the Manhattan
        distance, ``p=2`` the Euclidean distance, and ``p=np.inf`` the
        Chebyshev (maximum-coordinate) distance.

    Returns
    -------
    float
        The distance rounded to 2 decimals.

    Raises
    ------
    ValueError
        If ``p`` is not a positive number.
    """
    # `not p > 0` also rejects NaN, which would otherwise propagate silently.
    if not p > 0:
        raise ValueError("p must be a positive number, got {!r}".format(p))

    diff = np.abs(np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float))
    if np.isinf(p):
        # Limit of the Minkowski distance as p -> infinity; the generic
        # formula below degenerates (x**inf, 1/inf) and cannot be used.
        dist = diff.max(initial=0.0)
    else:
        dist = np.power(np.sum(diff ** p), 1 / p)
    return round(dist, 2)

In [2]:
class KNN:
    """k-nearest-neighbours classifier based on a Minkowski distance.

    Parameters
    ----------
    k : int, default 5
        Number of nearest training samples that take part in the vote.
    weight : bool, default False
        If falsy, every neighbour has one vote. If truthy, each neighbour
        votes with weight ``1 / distance``.
    p : float, default 2
        Order of the Minkowski distance (``np.inf`` gives the Chebyshev
        distance).
    show_nn : bool, default False
        If truthy, print the selected neighbours for every predicted point.
    """

    def __init__(self, k = 5, weight = False, p = 2 , show_nn = False):
        # A non-positive k would silently slice "all but the last |k|"
        # neighbours (or none at all), so reject it up front.
        if k < 1:
            raise ValueError("k must be at least 1, got {!r}".format(k))
        if not p > 0:
            raise ValueError("p must be a positive number, got {!r}".format(p))
        self.k = k
        self.weight = weight
        self.p = p
        self.show_nn = show_nn

    def fit(self, X, y):
        """Store the training samples ``X`` and their labels ``y``.

        Both are converted to NumPy arrays so that positional indexing is
        used later on, regardless of any pandas index the inputs carry.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            # Treat a flat vector as n samples with a single feature.
            X = X.reshape(-1, 1)
        y = np.asarray(y)
        if y.ndim == 2 and y.shape[1] == 1:
            # Accept a single-column frame / column vector of labels.
            y = y.ravel()
        if len(X) != len(y):
            raise ValueError(
                "X and y have different lengths: {} != {}".format(len(X), len(y))
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return an array with the predicted label of every row of ``X``."""
        X = np.asarray(X, dtype=float)
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _distances(self, x):
        """Distances from ``x`` to every training sample, rounded to 2 decimals.

        The rounding matches the notebook's ``minkowski_dist`` helper, so
        the ranking and the printed values stay the same as before.
        """
        diff = np.abs(self.X_train - x)
        if np.isinf(self.p):
            raw = diff.max(axis=1)
        else:
            raw = np.power(np.sum(diff ** self.p, axis=1), 1 / self.p)
        return np.round(raw, 2)

    def _predict(self, x):
        """Predict the label of a single sample ``x``."""
        # Distances between x and all examples in the training set.
        distances = self._distances(x)
        # Indices of the k closest training samples, nearest first.
        k_idx = np.argsort(distances)[:self.k]
        k_neighbor_labels = self.y_train[k_idx].tolist()
        k_distance = distances[k_idx]

        if self.weight:
            exact = k_distance == 0
            if exact.any():
                # 1 / 0 is undefined: when the query coincides with training
                # samples, only those samples vote (one vote each).
                weights = exact.astype(float)
            else:
                weights = 1.0 / k_distance
            totals = {}
            for label, w in zip(k_neighbor_labels, weights):
                totals[label] = totals.get(label, 0.0) + w
            # Scanning the labels in sorted order makes the smallest label
            # win when two classes have exactly the same total weight.
            cls = max(sorted(totals), key=totals.get)
            shown_name, shown_values = 'weights', weights
        else:
            # Most common label; on a tie the label seen first (i.e. the
            # one belonging to the nearer neighbour) wins.
            cls = Counter(k_neighbor_labels).most_common(1)[0][0]
            shown_name, shown_values = 'Distance', k_distance

        if self.show_nn:
            # The table is only built when it is actually going to be printed.
            df = pd.DataFrame({'nn_label': k_neighbor_labels, shown_name: shown_values})
            print("For point {} ,class = {}".format(list(round(num,3) for num in list(x)),cls))
            print(df)
            print('\n')
        return cls

    def score(self,X,y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``.

        Raises
        ------
        ValueError
            If ``y`` does not hold exactly one label per row of ``X``.
        """
        y_true = np.asarray(y)
        if y_true.ndim == 2 and y_true.shape[1] == 1:
            # Avoid broadcasting an (n, 1) column against (n,) predictions.
            y_true = y_true.ravel()
        y_pred = self.predict(X)
        if y_true.shape != y_pred.shape:
            raise ValueError(
                "y has shape {} but {} predictions were made".format(
                    y_true.shape, len(y_pred)
                )
            )
        accuracy = np.mean(y_true == y_pred)
        return accuracy

In [3]:
# Read the dataset from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer=r'heart.csv')

In [4]:
# Cast the three columns that are one-hot encoded further down to object
# dtype, all in one pass.
df = df.astype({column: 'object' for column in ('cp', 'slope', 'thal')})

In [5]:
# Boolean column mask: True only for the 'target' column.
is_target = df.columns == 'target'
X = df.loc[:, ~is_target]   # every column except 'target'
y = df.loc[:, is_target]    # single-column DataFrame holding 'target'

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set, stratified on y, with a fixed
# random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=100,
)

In [7]:
from feature_engine.encoding import OneHotEncoder

# The encoder is fitted on the training split only and then applied
# unchanged to the test split.
encoder = OneHotEncoder(
    variables=['cp', 'slope', 'thal'],
    drop_last=True,
)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [8]:
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split only and then reused for the
# test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Replace the label containers with their underlying `.values` arrays.
y_train, y_test = y_train.values, y_test.values

In [10]:
# Flatten both label arrays to one dimension.
y_train, y_test = y_train.reshape(-1), y_test.reshape(-1)

In [11]:
# Fit the custom classifier with default (unweighted) voting and report
# its accuracy on the test split.
k = 3
clf = KNN(k=k)
clf.fit(X_train, y_train)
print(
    "custom KNN classification accuracy",
    clf.score(X_test, y_test),
)

custom KNN classification accuracy 0.8157894736842105


In [12]:
# Same k, now with distance-weighted voting and with the neighbours of
# every test point printed while scoring.
k = 3
clf = KNN(k=k, weight=True, show_nn=True)
clf.fit(X_train, y_train)
print(
    "Custom KNN classification accuracy",
    clf.score(X_test, y_test),
)

For point [-0.725, -1.579, -0.077, 0.54, -0.42, 0.874, -0.487, -0.668, -0.72, -0.678, -0.961, 1.529, -0.427, 1.088, -0.961, 0.936, -0.818, -0.246] ,class = 1
   nn_label   weights
0         1  0.490196
1         1  0.436681
2         1  0.384615


For point [-0.948, 0.633, 1.107, -0.286, -0.42, 0.874, -0.134, -0.668, 2.263, -0.678, -0.961, 1.529, -0.427, -0.92, 1.04, 0.936, -0.818, -0.246] ,class = 1
   nn_label   weights
0         0  0.363636
1         1  0.347222
2         1  0.321543


For point [-1.953, -1.579, -0.669, -0.587, -0.42, 0.874, 0.879, -0.668, -0.896, -0.678, -0.961, 1.529, -0.427, 1.088, -0.961, 0.936, -0.818, -0.246] ,class = 1
   nn_label   weights
0         1  0.613497
1         1  0.423729
2         1  0.411523


For point [-1.841, 0.633, -0.669, -0.286, -0.42, 0.874, 1.407, 1.498, 2.439, -0.678, -0.961, -0.654, -0.427, -0.92, 1.04, -1.068, 1.222, -0.246] ,class = 0
   nn_label   weights
0         0  0.272480
1         0  0.255754
2         0  0.255102


For point 

For point [0.391, 0.633, -1.853, -0.23, -0.42, 0.874, 0.262, -0.668, -0.808, 0.302, 1.04, -0.654, -0.427, 1.088, -0.961, -1.068, 1.222, -0.246] ,class = 0
   nn_label   weights
0         0  0.462963
1         1  0.462963
2         0  0.350877


For point [0.168, 0.633, 0.041, -1.169, -0.42, -1.034, -1.985, 1.498, 0.947, 0.302, 1.04, -0.654, -0.427, -0.92, 1.04, -1.068, -0.818, 4.057] ,class = 0
   nn_label   weights
0         1  0.363636
1         0  0.337838
2         0  0.267380


For point [-0.055, 0.633, -0.669, -1.094, -0.42, 0.874, -1.632, -0.668, 0.333, 0.302, 1.04, -0.654, -0.427, -0.92, 1.04, -1.068, 1.222, -0.246] ,class = 0
   nn_label   weights
0         0  0.552486
1         0  0.423729
2         0  0.358423


For point [0.503, 0.633, 0.515, -0.474, -0.42, 0.874, 0.614, 1.498, -0.896, -0.678, -0.961, -0.654, 2.342, 1.088, -0.961, 0.936, -0.818, -0.246] ,class = 1
   nn_label   weights
0         1  0.408163
1         1  0.375940
2         1  0.374532


For point [1.173, 0.6

For point [-0.39, 0.633, -1.853, -0.455, -0.42, 0.874, -0.311, 1.498, 0.157, -0.678, -0.961, 1.529, -0.427, -0.92, 1.04, 0.936, -0.818, -0.246] ,class = 1
   nn_label   weights
0         1  0.306748
1         1  0.299401
2         1  0.294118


For point [-1.06, 0.633, -0.965, 0.258, -0.42, -1.034, 1.539, -0.668, -0.896, -0.678, 1.04, -0.654, -0.427, 1.088, -0.961, 0.936, -0.818, -0.246] ,class = 0
   nn_label   weights
0         1  1.123596
1         0  0.621118
2         0  0.549451


For point [0.838, -1.579, 0.396, 0.897, 2.383, 0.874, -1.941, -0.668, 0.772, 2.262, 1.04, -0.654, -0.427, -0.92, 1.04, 0.936, -0.818, -0.246] ,class = 0
   nn_label   weights
0         1  0.322581
1         0  0.236407
2         0  0.213220


Custom KNN classification accuracy 0.8157894736842105
