In [317]:
import numpy as np
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt

In [318]:
from sklearn import datasets
features_set, prediction_set = datasets.make_friedman1(n_samples=100, n_features=5)

In [319]:
features_set[:5], prediction_set[:5]

(array([[0.31058228, 0.53365385, 0.96884976, 0.27499495, 0.78731975],
        [0.57212681, 0.11233317, 0.15461977, 0.25807363, 0.62033927],
        [0.14383046, 0.51450369, 0.41091861, 0.88040051, 0.31837716],
        [0.85956871, 0.12141997, 0.89319138, 0.47194862, 0.04879158],
        [0.41750939, 0.27599129, 0.46632992, 0.18937885, 0.89147419]]),
 array([16.05781055, 10.0735569 , 12.85853495, 11.27584068,  9.91530892]))

In [320]:
def euclidean_distance(v, w):
  return np.sqrt(
                  sum(
                      pow(v_i-w_i,2) for v_i, w_i in zip(v,w)
                      )
                  )
  

def manhattan_distance(v, w):
  return sum(
            abs(v_i - w_i) for v_i, w_i in zip(v,w)
            )

In [321]:
class knn_regression:
  """Implementation of KNN Regression class"""

  def __init__(self, k_val=1, distance_metric=None):
    self.k_val = k_val
    self.distance_metric = distance_metric
    self.combined = None

  def average(self, v):
    return np.sum(v) / len(v)

  # fitting the data into a single a set
  def fit(self, X, y):
    P = np.concatenate((X.reshape(len(X), X.shape[1]), y.reshape(len(y), 1)), 1)
    self.combined = P

  # predicting the value for the test set
  def predict(self, X, y):
    pred = []
    for i in X:
      if (self.distance_metric == 'euclidean'):
        sort_val = sorted(self.combined, key=lambda point: euclidean_distance(point[:-1], i))
      
      else:
        sort_val = sorted(self.combined, key=lambda point: manhattan_distance(point[:-1], i))

      k_nearest_neighbors = [point[-1] for point in sort_val[:self.k_val]]

      pred.append(self.average(k_nearest_neighbors))

    return pred

In [322]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_set, prediction_set, test_size=0.2, random_state=42)

In [323]:
knn_reg = knn_regression(k_val=6, distance_metric='euclidean')
knn_reg.fit(X_train, y_train)

pred = np.array(knn_reg.predict(X_test, y_test))

In [324]:
np.concatenate((pred.reshape(len(pred), 1), y_test.reshape(len(y_test), 1)), 1)

array([[18.80443999, 20.74206627],
       [12.17067732, 13.81723939],
       [16.78664773, 17.81961886],
       [17.41326548, 17.48249302],
       [21.17906039, 20.50016097],
       [14.1029645 , 16.10013472],
       [12.96752482, 11.51405781],
       [ 8.22259185,  6.02354253],
       [13.17264055, 12.6055153 ],
       [15.14630054, 16.05781055],
       [18.15771872, 21.87288581],
       [10.08038557,  5.83388314],
       [ 9.99541499,  6.53241291],
       [14.45327519, 16.73993945],
       [17.94665643, 23.5974251 ],
       [12.34452588,  9.91530892],
       [19.34865246, 20.81787222],
       [17.22455167, 21.19158942],
       [11.02522055, 12.30472449],
       [19.04399534, 16.59545421]])

In [325]:
def root_mean_squared_error(v, w):
  rmse = (1 / len(v)) * sum( pow((v_i - w_i), 2) for v_i, w_i in zip(v,w))
  return np.sqrt(rmse)

rmse = root_mean_squared_error(pred, y_test)
print(rmse)

2.573956780964302


In [326]:
# calculating rmse for different values of k
rmses = []

for i in range(1, 20):
  knn_reg2 = knn_regression(k_val=i, distance_metric='euclidean')
  knn_reg2.fit(X_train, y_train)

  pred = np.array(knn_reg2.predict(X_test, y_test))
  rmses.append([root_mean_squared_error(pred, y_test), i])

In [327]:
rmses

[[2.9952397217404076, 1],
 [2.4400156248391847, 2],
 [2.6762905080560886, 3],
 [2.617799017697007, 4],
 [2.4437655864343766, 5],
 [2.573956780964302, 6],
 [2.5836381495198135, 7],
 [2.604259803767955, 8],
 [2.8123854073521906, 9],
 [2.8041221229067084, 10],
 [2.68395742940159, 11],
 [2.818215117477416, 12],
 [2.8967167025099845, 13],
 [2.9477094189251973, 14],
 [3.034969382767189, 15],
 [3.0525522187510528, 16],
 [3.1466153829351846, 17],
 [3.2192024901213845, 18],
 [3.2317805221666656, 19]]

In [330]:
# taking k=5

knn_reg2 = knn_regression(k_val=5, distance_metric='euclidean')
knn_reg2.fit(X_train, y_train)

pred = np.array(knn_reg2.predict(X_test, y_test))
print(root_mean_squared_error(pred, y_test))

2.4437655864343766


In [331]:
np.concatenate((pred.reshape(len(pred), 1), y_test.reshape(len(y_test), 1)), 1)

array([[18.7304816 , 20.74206627],
       [12.46198515, 13.81723939],
       [16.80371061, 17.81961886],
       [17.90471101, 17.48249302],
       [22.0913382 , 20.50016097],
       [14.04504317, 16.10013472],
       [11.77385656, 11.51405781],
       [ 8.40360588,  6.02354253],
       [12.85862781, 12.6055153 ],
       [13.82643859, 16.05781055],
       [17.44141431, 21.87288581],
       [ 9.01027911,  5.83388314],
       [ 9.65465532,  6.53241291],
       [14.99812137, 16.73993945],
       [18.24440292, 23.5974251 ],
       [12.31375957,  9.91530892],
       [20.53492806, 20.81787222],
       [17.79094777, 21.19158942],
       [10.68828955, 12.30472449],
       [18.50367236, 16.59545421]])