<a href="https://colab.research.google.com/github/rakesh4real/10hourcodingchallenge/blob/master/01_MALG_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np

In [0]:
def euclidean_dist(X1, X2):
  return np.sqrt(np.sum((X1-X2)**2))

In [3]:
print(euclidean_dist(
    np.array([20]),
    np.array([11])
))

print(euclidean_dist(
    np.array([3, 4]),
    np.array([1, 2])
))


9.0
2.8284271247461903


In [0]:
from collections import Counter

class KNN:
  def __init__(self, k=3, dist_metric="euclidean"):
    self.k = k
    if dist_metric == "euclidean":
      self.dist_metric = euclidean_dist

  # no training (fitting). Only predicting is involved
  # based on training(fit) data 
  def fit(self, train_X, train_y):
    self.X = train_X
    self.y = train_y

  def predict(self, test_X):
      return [self._predict(test_x) for test_x in test_X]
    
  def _predict(self,test_x):
    # 1. calculate all distances
    distances = [self.dist_metric(test_x, train_x) for train_x in self.X]
    # 2. get indices of top k distances
    top_k_dists_indices = np.argsort(distances)[:self.k]
    # 3. based on ys from training set, get 
    # most common votes
    top_k_ys = [self.y[i] for i in top_k_dists_indices]
    return Counter(top_k_ys).most_common(1)[0][0]
  

In [5]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1234)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(120, 4)
(120,)
(30, 4)
(30,)


In [6]:
clf = KNN(k=3)

clf.fit(X_train, y_train)
preds = clf.predict(X_test)

clf_acc = np.sum(preds == y_test) / len(y_test)
print(clf_acc)

1.0


In [7]:
clf = KNN(k=5)

clf.fit(X_train, y_train)
preds = clf.predict(X_test)

clf_acc = np.sum(preds == y_test) / len(y_test)
print(clf_acc)

0.9666666666666667


### KNN Regression

In [0]:
class KNNRegression(KNN):
  def _predict(self,test_x):
    # 1. calculate all distances
    distances = [self.dist_metric(test_x, train_x) for train_x in self.X]
    # 2. get indices of top k distances
    top_k_dists_indices = np.argsort(distances)[:self.k]
    # 3. based on ys from training set, get 
    # most common votes
    top_k_ys = [self.y[i] for i in top_k_dists_indices]
    return np.mean(top_k_ys) # ONLY CHANGE

In [9]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

boston = datasets.load_boston()
X, y = boston.data, boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1234)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(404, 13)
(404,)
(102, 13)
(102,)


In [14]:
clf = KNNRegression(k=3)

clf.fit(X_train, y_train)
preds = clf.predict(X_test)


mse = np.sqrt(np.sum((preds-y_test)**2))
print(mse)

70.7194299876475


In [15]:
clf = KNNRegression(k=5)

clf.fit(X_train, y_train)
preds = clf.predict(X_test)


mse = np.sqrt(np.sum((preds-y_test)**2))
print(mse)

68.06769571536852


In [17]:
hist = []
for k_val in range(0,10):
  clf = KNNRegression(k=k_val)
  clf.fit(X_train, y_train)
  preds = clf.predict(X_test)
  mse = np.sqrt(np.sum((preds-y_test)**2))
  hist.append((k_val, mse))
  print(f"{k_val}: {mse}")

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


0: nan
1: 70.15140768366662
2: 75.90307964766647
3: 70.7194299876475
4: 67.5550932572815
5: 68.06769571536852
6: 67.45612977665148
7: 70.30415118234049
8: 72.08698088073878
9: 71.92533251080681
