Import libraries

In [1]:
import numpy as np

Load Iris dataset

In [None]:
from sklearn import datasets

# Fetch the Iris bunch, then unpack the feature matrix and the label vector
iris = datasets.load_iris()
iris_X, iris_y = iris.data, iris.target

Review the 3 classes of the dataset and remove one of them

In [11]:
# Summarise the label vector: the distinct class values and the total sample count
unique_classes = np.unique(iris_y)
print('Number of classes: {}; Values: {}'.format(len(unique_classes), unique_classes))
print('Number of data points: {}'.format(len(iris_y)))

Number of classes: 3; Values: [0 1 2]
Number of data points: 150


In [16]:
# Partition the feature rows by class label and report the size of each group
X0, X1, X2 = (iris_X[iris_y == label] for label in (0, 1, 2))

print('Class 0: {} data points; Class 1: {} data points; Class 2: {} data points.'.format(*map(len, (X0, X1, X2))))

print('For this notebook we use class 0 and class 1 only')

Class 0: 50 data points; Class 1: 50 data points; Class 2: 50 data points.


In [19]:
# Display the full label vector to see how the samples of each class are ordered
iris_y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

Data to use:

In [21]:
# Keep only the samples of class 0 and class 1.
# Selecting by label (instead of slicing the first 100 rows) does not depend on
# the dataset being sorted by class or on each class having exactly 50 rows.
binary_mask = np.isin(iris_y, (0, 1))
X = iris_X[binary_mask]
y = iris_y[binary_mask]

X.shape, y.shape

((100, 4), (100,))

Split datasets and normalize input features

In [23]:
def normalize_features(X):
  '''
  Standardize every feature (column) of X independently.

  Each column is shifted by its mean and divided by its standard deviation
  (np.std default, i.e. population std), so a non-constant column ends up
  with mean 0 and standard deviation 1. A column whose std is exactly 0 is
  divided by 1e-8 instead, which avoids a division-by-zero error.

  Returns a new array; X itself is not modified.
  '''
  col_mean = np.mean(X, axis=0)
  col_std = np.std(X, axis=0)

  # Constant columns have std == 0: swap in a tiny value so the division is defined
  constant_cols = (col_std == 0)
  col_std[constant_cols] = 1e-8

  return (X - col_mean) / col_std

In [24]:
def train_test_split(X, y, test_size=0.2, random_state=None):
  '''
  Randomly split X and y into train and test subsets.

  Parameters:
    X: array whose first axis indexes samples.
    y: array of labels with the same number of samples as X.
    test_size: fraction (between 0 and 1) of the samples placed in the test
      set; the count is truncated with int().
    random_state: optional seed. When given, a dedicated NumPy Generator is
      used so the split is reproducible; when None (default) the global
      np.random state is used.

  Returns:
    X_train, X_test, y_train, y_test

  Raises:
    ValueError: if X and y have a different number of samples, or if
      test_size is outside [0, 1].
  '''
  if X.shape[0] != y.shape[0]:
    raise ValueError('X and y must have the same number of samples.')
  if not 0 <= test_size <= 1:
    raise ValueError('test_size must be a fraction between 0 and 1.')

  num_samples = X.shape[0]
  test_samples = int(num_samples * test_size)
  # Use an explicit split position: slicing with -test_samples breaks when
  # test_samples == 0, because [:-0] is empty and [-0:] is the whole array.
  split_at = num_samples - test_samples

  # Randomly shuffle before sampling
  indices = np.arange(num_samples)
  if random_state is None:
    np.random.shuffle(indices)
  else:
    np.random.default_rng(random_state).shuffle(indices)

  # Split data
  train_idx = indices[:split_at]
  test_idx = indices[split_at:]

  return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

In [None]:
# Standardize every feature column of X (zero mean, unit std per column).
# NOTE(review): the mean/std are computed over all rows, including rows that
# are later placed in the test set by the split below; computing them on the
# training split only would avoid leaking test-set statistics.
# X is rebound here, so the un-normalized slice is no longer reachable under this name.
X = normalize_features(X)

In [26]:
# Seed NumPy's global RNG so the shuffle inside train_test_split, and therefore
# the accuracies reported below, are reproducible after a kernel restart.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y)

Calculate the Euclidean distance between two data points

In [2]:
def euclidean_dist(p1, p2):
  '''
  Return the Euclidean (L2) distance between two points.

  p1 and p2 are expected to be NumPy arrays of the same shape; the result is
  the square root of the sum of squared coordinate differences.
  '''
  diff = p1 - p2
  return np.sqrt(np.sum(np.square(diff)))

In [4]:
# Sanity check: squared differences are 0 + 1 + 4 + 9 = 14, so expect sqrt(14) ~ 3.7417
point1, point2 = np.array([1, 2, 3, 7]), np.array([1, 1, 1, 10])

print(euclidean_dist(point1, point2))

3.7416573867739413


Select K nearest neighbors for an instance (data point) based on distance

In [36]:
def select_k_neighbors(X_train, instance, k=3):
  '''
  Find the k training points closest to instance.

  Returns a list of (distance, index) tuples ordered from nearest to
  farthest, where index is the row position of the neighbor in X_train.
  Points at equal distance keep their original row order.
  '''
  # Pair each training row's distance to the instance with that row's index
  scored = [
    (euclidean_dist(row, instance), row_idx)
    for row_idx, row in enumerate(X_train)
  ]

  # sorted() is stable, so ties on distance stay in row order
  ranked = sorted(scored, key=lambda pair: pair[0])

  return ranked[:k]

Decide which class a data point (input instance) belongs to based on K nearest neighbors

In [34]:
def choose_class(y_train, k_neighbors):
  '''
  Pick the majority class among the given neighbors.

  k_neighbors is a sequence of (distance, index) tuples; each index is looked
  up in y_train to get that neighbor's label. The label with the most votes
  is returned. When several labels tie, the one encountered first while
  walking k_neighbors wins.
  '''
  neighbor_labels = [y_train[neighbor[1]] for neighbor in k_neighbors]

  # Tally one vote per neighbor
  tally = {}
  for label in neighbor_labels:
    tally[label] = tally.get(label, 0) + 1

  # Stable descending sort by vote count keeps first-seen order among ties
  ranking = sorted(tally, key=tally.get, reverse=True)

  return ranking[0]

Function to calculate the accuracy and confusion matrix of the predictions

In [39]:
from sklearn.metrics import confusion_matrix

def measure_acc(y_test, predictions):
  '''
  Compute the accuracy and the confusion matrix of a set of predictions.

  Parameters:
    y_test: true labels (array-like).
    predictions: predicted labels (array-like) with the same shape as y_test.

  Returns:
    (acc, cm): acc is the fraction of positions where y_test equals
    predictions; cm is the result of confusion_matrix(y_test, predictions).

  Raises:
    ValueError: if y_test and predictions do not have the same shape.
  '''
  # Ensure y_test and predictions are numpy arrays
  y_test = np.asarray(y_test)
  predictions = np.asarray(predictions)

  # Fail fast on a shape mismatch: continuing would either broadcast the
  # comparison or compare the arrays as a whole, giving a meaningless accuracy.
  if y_test.shape != predictions.shape:
    raise ValueError('y_test and predictions must have the same shape.')

  # calc accuracy = percentage of same values count over all values
  acc = np.mean(y_test == predictions)

  cm = confusion_matrix(y_test, predictions)

  return acc, cm

Main code snippet:
- Instances are from test set
- Data points to calc distance with instances are from train set

In [41]:
k_list = [3, 5, 7, 9, 11]

for k in k_list:
  # Classify each test instance by majority vote among its k nearest training points
  predictions = [
    choose_class(y_train, select_k_neighbors(X_train, x_instance, k=k))
    for x_instance in X_test
  ]

  # Score this value of k against the held-out labels
  accuracy, cm = measure_acc(y_test, predictions)
  print('k = {}; acc = {}'.format(k, accuracy))
  print('Confusion matrix for k = {}'.format(k))
  print(cm, '\n')

k = 3; acc = 1.0
Confusion matrix for k = 3
[[ 7  0]
 [ 0 13]] 

k = 5; acc = 1.0
Confusion matrix for k = 5
[[ 7  0]
 [ 0 13]] 

k = 7; acc = 1.0
Confusion matrix for k = 7
[[ 7  0]
 [ 0 13]] 

k = 9; acc = 1.0
Confusion matrix for k = 9
[[ 7  0]
 [ 0 13]] 

k = 11; acc = 1.0
Confusion matrix for k = 11
[[ 7  0]
 [ 0 13]] 

