# Implementing distance weighted KNN

**Importing necessary libraries and datasets**

In [9]:
import numpy as np
import scipy.spatial
from sklearn import metrics, datasets
from sklearn.model_selection import train_test_split


# Initial imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

**Loading the iris dataset into a DataFrame**

In [10]:
# Fetch the bundled iris dataset and combine its features and labels in one frame

iris = datasets.load_iris()

# Feature columns come from iris.data; the class labels go in an extra 'target' column
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    target=iris.target
)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


**Performing KNN Classification**

In [3]:
# class KNN:
# #     constructor for getting the value of k
#     def __init__(self, k):
#         self.k = k
        
# #         assigning the value of X_train matrix and y_train vector
#     def fit(self, X, y):
#         self.X_train = X
#         self.y_train = y
        
# #         function to calculate Euclidean distance
#     def distance(self, X1, X2):
#         distance = scipy.spatial.distance.euclidean(X1, X2)
    
# #     function to predict the class on X_test
#     def predict(self, X_test):
#         final_output = []
        
# #         calculating Euclidean distance of each test example with each of the training examples
#         for i in range(len(X_test)):
#             d = []
#             votes = []
#             for j in range(len(X_train)):
#                 dist = scipy.spatial.distance.euclidean(X_train[j] , X_test[i])
#                 d.append([dist, y_train[j]])
                
# #                 sorting the list in increasing order and choosing the k nearest neighbors
#             d.sort()
#             d = d[0:self.k]
        
# #         calculating the class based on the distance-weighted KNN
#             freq1 = 0
#             freq2 = 0
#             freq3 = 0
#             for dist,result in d:
#                 if result == 0: 
#                     freq1 += (1 / dist) 
#                 elif result == 1:  
#                     freq2 += (1 /dist)
#                 else:
#                     freq3 += (1/dist)  
                    
#             maximum = np.max([freq1,freq2,freq3])
#             if maximum == freq1:
#                 ans = 0
#             elif maximum == freq2:
#                 ans = 1
#             else:
#                 ans = 2
#             final_output.append(ans)
            
#         return final_output
    
# #     checks the accuracy of the predicted values by running the prediction and comparing it to y_test
#     def score(self, X_test, y_test):
#         predictions = self.predict(X_test)
#         return ((predictions == y_test).sum() / len(y_test)) * 100

**Testing our model with different values of K**

In [11]:

# Pull the label column out as y and keep every remaining column as the features X

y = df['target']
X = df.drop(columns='target')

In [12]:
def minkowski_distance(a, b, p=1):
    """Return the Minkowski distance of order ``p`` between two points.

    The distance is ``(sum_i |a_i - b_i| ** p) ** (1 / p)``; ``p=1`` gives the
    Manhattan distance and ``p=2`` the Euclidean distance.

    Parameters
    ----------
    a, b : array-like of numbers
        The two points (list, tuple, NumPy array or pandas Series). They must
        have the same shape.
    p : number, default 1
        Order of the distance. ``p=0`` raises ``ZeroDivisionError`` from the
        ``1 / p`` exponent.

    Returns
    -------
    float
        The distance between ``a`` and ``b``.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same shape.
    """
    # Convert up front so components are always paired by position. Indexing a
    # label-indexed pandas Series with an integer (e.g. a row taken with
    # DataFrame.iloc) relies on a deprecated positional fallback.
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)

    # Refuse mismatched inputs instead of silently ignoring extra components.
    if vec_a.shape != vec_b.shape:
        raise ValueError(
            "a and b must have the same shape, got "
            f"{vec_a.shape} and {vec_b.shape}"
        )

    # Sum of p-th powers of the absolute differences, then the p-th root.
    powered_sum = np.sum(np.abs(vec_a - vec_b) ** p)
    return float(powered_sum ** (1 / p))


# Sanity check: order-1 distance between the first two rows of X

minkowski_distance(X.iloc[0], X.iloc[1], p=1)

0.6999999999999993

In [14]:
test_pt = [4.8, 2.7, 2.5, 0.7]

# Calculate distance between test_pt and all points in X.
# Rows are walked positionally through the underlying array: the previous
# `for i in X.index: X.iloc[i]` used index *labels* as *positions*, which is
# only correct while X keeps its default 0..n-1 index.

distances = [minkowski_distance(test_pt, row) for row in X.to_numpy()]

# Keep X's own index so each distance stays attached to its source row
df_dists = pd.DataFrame(data=distances, index=X.index, columns=['dist'])
df_dists.head()

Unnamed: 0,dist
0,2.7
1,2.0
2,2.3
3,2.1
4,2.7


In [15]:

# Keep the 5 rows with the smallest distance to test_pt (ascending order)

df_nn = df_dists.sort_values(by='dist').head(5)
df_nn

Unnamed: 0,dist
98,1.4
57,1.5
93,1.7
24,1.8
30,1.8


In [16]:
from collections import Counter

# Tally the labels of the nearest neighbours, looked up by their index labels

counter = Counter(y.loc[df_nn.index])

# The first entry of the single most common (label, count) pair is the predicted label

counter.most_common(1)[0][0]

1

In [17]:
from sklearn.preprocessing import StandardScaler

# Hold out 25% of the rows for testing; random_state pins the shuffle

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Standardise the features: the scaler is fitted on the training split only,
# then that same fitted transform is applied to both splits

scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [21]:
def knn_predict(X_train, X_test, y_train, y_test, k, p, weights='uniform'):
    """Predict a label for every test point with k-nearest-neighbours voting.

    For each row of ``X_test`` the Minkowski distance (order ``p``) to every
    row of ``X_train`` is computed with ``minkowski_distance``; the ``k``
    closest training points then vote on the label.

    Parameters
    ----------
    X_train : array-like, one row per training point
    X_test : array-like, one row per test point
    y_train : array-like
        Labels of the training points, in the same order as the rows of
        ``X_train``. Any array-like is accepted (pandas Series, NumPy array,
        list).
    y_test : array-like
        Unused. Kept only so existing calls keep working.
    k : int
        Number of neighbours that vote; must be at least 1. If ``k`` exceeds
        the number of training points, all of them vote.
    p : number
        Order of the Minkowski distance (1 = Manhattan, 2 = Euclidean).
    weights : {'uniform', 'distance'}, default 'uniform'
        ``'uniform'`` gives every neighbour one vote (plain majority vote).
        ``'distance'`` weights each vote by ``1 / distance``; if one or more
        neighbours sit at distance 0, only those neighbours vote, which avoids
        dividing by zero.

    Returns
    -------
    list
        One predicted label per test point, in the order of ``X_test``.

    Raises
    ------
    ValueError
        If ``weights`` is not a supported option, ``k`` is smaller than 1,
        ``X_train`` is empty, or ``X_train`` and ``y_train`` differ in length.
    """
    if weights not in ('uniform', 'distance'):
        raise ValueError("weights must be 'uniform' or 'distance'")
    if k < 1:
        raise ValueError("k must be at least 1")

    # Work on plain arrays so neighbours are selected by position; this makes
    # the function independent of whether y_train carries a pandas index and
    # lets X_train / X_test be DataFrames as well as arrays.
    train_points = np.asarray(X_train)
    test_points = np.asarray(X_test)
    labels = np.asarray(y_train)

    if len(train_points) == 0:
        raise ValueError("X_train must contain at least one point")
    if len(train_points) != len(labels):
        raise ValueError("X_train and y_train must have the same length")

    # Need output of 1 prediction per test data point
    y_hat_test = []

    for test_point in test_points:
        distances = np.array(
            [minkowski_distance(test_point, train_point, p=p)
             for train_point in train_points],
            dtype=float,
        )

        # Positions of the k closest training points. A stable sort keeps
        # equally distant points in training order, so ties are deterministic.
        neighbor_idx = np.argsort(distances, kind='stable')[:k]
        neighbor_dists = distances[neighbor_idx]

        if weights == 'distance':
            at_zero = neighbor_dists == 0
            if at_zero.any():
                # Exact matches take the whole vote (and 1 / 0 is avoided).
                vote_weights = at_zero.astype(float)
            else:
                vote_weights = 1.0 / neighbor_dists
        else:
            vote_weights = np.ones(len(neighbor_idx))

        # Accumulate votes nearest-first. dicts keep insertion order and max()
        # returns the first maximal key, so a tie goes to the label that was
        # seen closest to the test point.
        votes = {}
        for label, weight in zip(labels[neighbor_idx].tolist(),
                                 vote_weights.tolist()):
            votes[label] = votes.get(label, 0.0) + weight

        y_hat_test.append(max(votes, key=votes.get))

    return y_hat_test


# Predict a label for every point in the test split.
# NOTE(review): k=100 looks very large for this split (the recorded run below
# has 38 test points and never predicts label 1) - consider a much smaller k.
y_hat_test = knn_predict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    k=100,
    p=1,
)

print(y_hat_test)

[0, 0, 2, 0, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0]


In [22]:
# Share of test points whose predicted label matches the true label
print(metrics.accuracy_score(y_true=y_test, y_pred=y_hat_test))

0.5789473684210527
