In [301]:
import numpy as np
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt

In [302]:
from sklearn import datasets
features_set, prediction_set = datasets.make_friedman1(n_samples=100, n_features=5)

In [316]:
features_set[:5], prediction_set[:5]

(array([[0.45712485, 0.23960421, 0.51013998, 0.34475269, 0.15799833],
        [0.20494553, 0.59044427, 0.34259411, 0.83645975, 0.60744485],
        [0.99673362, 0.8215444 , 0.76134545, 0.8158116 , 0.78242937],
        [0.54239744, 0.1198728 , 0.76636525, 0.47518037, 0.83637972],
        [0.93328895, 0.99943568, 0.24949841, 0.70378605, 0.1195731 ]]),
 array([ 7.6130294 , 15.60805132, 18.82473956, 12.3811594 , 10.98740875]))

In [304]:
def euclidean_distance(v, w):
  return np.sqrt(
                  sum(
                      pow(v_i-w_i,2) for v_i, w_i in zip(v,w)
                      )
                  )
  

def manhattan_distance(v, w):
  return sum(
            abs(v_i - w_i) for v_i, w_i in zip(v,w)
            )

In [305]:
class knn_regression:
  """Implementation of KNN Regression class"""

  def __init__(self, k_val=1, distance_metric=None):
    self.k_val = k_val
    self.distance_metric = distance_metric
    self.combined = None

  def average(self, v):
    return np.sum(v) / len(v)

  # fitting the data into a single a set
  def fit(self, X, y):
    P = np.concatenate((X.reshape(len(X), X.shape[1]), y.reshape(len(y), 1)), 1)
    self.combined = P

  # predicting the value for the test set
  def predict(self, X, y):
    pred = []
    for i in X:
      if (self.distance_metric == 'euclidean'):
        sort_val = sorted(self.combined, key=lambda point: euclidean_distance(point[:-1], i))
      
      else:
        sort_val = sorted(self.combined, key=lambda point: manhattan_distance(point[:-1], i))

      k_nearest_neighbors = [point[-1] for point in sort_val[:self.k_val]]

      pred.append(self.average(k_nearest_neighbors))

    return pred

In [306]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_set, prediction_set, test_size=0.2, random_state=42)

In [307]:
knn_reg = knn_regression(k_val=6, distance_metric='euclidean')
knn_reg.fit(X_train, y_train)

pred = np.array(knn_reg.predict(X_test, y_test))

In [308]:
np.concatenate((pred.reshape(len(pred), 1), y_test.reshape(len(y_test), 1)), 1)

array([[ 8.79370824,  5.83299741],
       [12.73105449,  9.17070627],
       [15.00794069, 19.04609724],
       [14.56937148, 12.07022538],
       [12.51417613, 11.65442323],
       [20.92221299, 20.75453836],
       [17.83970774, 13.20348722],
       [12.38478112, 14.04735742],
       [19.8577702 , 19.43612453],
       [ 9.86569712,  7.6130294 ],
       [17.47271444, 23.49642705],
       [12.87899661, 12.95365147],
       [15.70243611, 16.20929013],
       [18.94316046, 19.0832695 ],
       [10.02710329,  7.60963189],
       [15.17905888, 10.98740875],
       [12.65325104, 13.10978469],
       [16.39547089, 17.705057  ],
       [16.52709759, 16.14361234],
       [10.01819796,  6.3255843 ]])

In [309]:
def root_mean_squared_error(v, w):
  rmse = (1 / len(v)) * sum( pow((v_i - w_i), 2) for v_i, w_i in zip(v,w))
  return np.sqrt(rmse)

rmse = root_mean_squared_error(pred, y_test)
print(rmse)

2.738246042130682


In [310]:
# calculating rmse for different values of k
rmses = []

for i in range(1, 20):
  knn_reg2 = knn_regression(k_val=i, distance_metric='euclidean')
  knn_reg2.fit(X_train, y_train)

  pred = np.array(knn_reg2.predict(X_test, y_test))
  rmses.append([root_mean_squared_error(pred, y_test), i])

In [311]:
rmses

[[3.7263741266161503, 1],
 [3.213183313248668, 2],
 [3.091447048935396, 3],
 [2.8642857167022124, 4],
 [2.7923998970420754, 5],
 [2.738246042130682, 6],
 [2.678287891621569, 7],
 [2.791591134793272, 8],
 [2.9293894243982876, 9],
 [2.9769331175556553, 10],
 [2.95009361990289, 11],
 [2.990059172073246, 12],
 [3.024059170869205, 13],
 [3.098481147131812, 14],
 [3.1775097424284153, 15],
 [3.238707203395234, 16],
 [3.201875563495806, 17],
 [3.319073898924869, 18],
 [3.3227604792017695, 19]]

In [314]:
# taking k=7

knn_reg2 = knn_regression(k_val=7, distance_metric='euclidean')
knn_reg2.fit(X_train, y_train)

pred = np.array(knn_reg2.predict(X_test, y_test))
print(root_mean_squared_error(pred, y_test))

2.678287891621569


In [315]:
np.concatenate((pred.reshape(len(pred), 1), y_test.reshape(len(y_test), 1)), 1)

array([[ 8.5136942 ,  5.83299741],
       [12.44623147,  9.17070627],
       [14.97511133, 19.04609724],
       [14.63301066, 12.07022538],
       [12.74813084, 11.65442323],
       [20.78063112, 20.75453836],
       [16.79011743, 13.20348722],
       [12.89564536, 14.04735742],
       [19.96043766, 19.43612453],
       [ 9.46991868,  7.6130294 ],
       [17.91610415, 23.49642705],
       [13.06083411, 12.95365147],
       [16.3748681 , 16.20929013],
       [18.93780597, 19.0832695 ],
       [10.29199305,  7.60963189],
       [15.15559986, 10.98740875],
       [13.75309795, 13.10978469],
       [15.06686762, 17.705057  ],
       [16.31106161, 16.14361234],
       [10.81194907,  6.3255843 ]])