In [1]:
import pandas as pd
import numpy as np
from math import sqrt

In [2]:
from sklearn import datasets
iris = datasets.load_iris()

In [3]:
iris_data = iris.data
iris_target = iris.target

In [4]:
print(iris_data[:5])

[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]]


In [5]:
# find min and max values in each column
def minmax(dataset):
    minmax = []
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax

In [6]:
minmax = minmax(iris_data)

In [7]:
# nomalize dataset
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

In [8]:
# from sklearn.preprocessing import MinMaxScaler
# scalar = MinMaxScaler()
# iris_data = scalar.fit_transform(iris.data)
normalize_dataset(iris_data, minmax)

In [9]:
print(iris_data)

[[ 0.22222222  0.625       0.06779661  0.04166667]
 [ 0.16666667  0.41666667  0.06779661  0.04166667]
 [ 0.11111111  0.5         0.05084746  0.04166667]
 [ 0.08333333  0.45833333  0.08474576  0.04166667]
 [ 0.19444444  0.66666667  0.06779661  0.04166667]
 [ 0.30555556  0.79166667  0.11864407  0.125     ]
 [ 0.08333333  0.58333333  0.06779661  0.08333333]
 [ 0.19444444  0.58333333  0.08474576  0.04166667]
 [ 0.02777778  0.375       0.06779661  0.04166667]
 [ 0.16666667  0.45833333  0.08474576  0.        ]
 [ 0.30555556  0.70833333  0.08474576  0.04166667]
 [ 0.13888889  0.58333333  0.10169492  0.04166667]
 [ 0.13888889  0.41666667  0.06779661  0.        ]
 [ 0.          0.41666667  0.01694915  0.        ]
 [ 0.41666667  0.83333333  0.03389831  0.04166667]
 [ 0.38888889  1.          0.08474576  0.125     ]
 [ 0.30555556  0.79166667  0.05084746  0.125     ]
 [ 0.22222222  0.625       0.06779661  0.08333333]
 [ 0.38888889  0.75        0.11864407  0.08333333]
 [ 0.22222222  0.75        0.08

In [10]:
print(iris_target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [11]:
# calculate distances between two entries
def cal_dis(row1, row2):
    distance = 0.0
    for i in range(len(row1)):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

In [12]:
# find the index of k nearest neighbors
def k_neighbor_index(array, k):
    sortedIndex = np.argsort(array)
    index = []
    for i in range(0,k):
        index.append(sortedIndex[i+1])
    return index

In [13]:
# majority selection
def select_major(index, dataset_target):
    a = np.zeros(len(index),dtype=int)
    for i in range(0,len(index)):
        a[i] = dataset_target[index[i]]
    return (np.argmax(np.bincount(a)))

In [14]:
# knn on entire dataset
def knn(dataset, k, dataset_target):
    predictions = []
    for i in range(0,dataset.shape[0]):
        row0 = dataset[i]
        dis = []
        for row in dataset:
            dis.append(cal_dis(row0, row))
        array = np.array(dis)
        index = k_neighbor_index(array= array, k=k)
        predictions.append(select_major(index= index, dataset_target= dataset_target))
    return predictions

In [15]:
predictions = knn(iris_data, 5, iris_target)

In [16]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
print(classification_report(iris_target ,predictions))
print('Confusion Matrix: \n',confusion_matrix(iris_target ,predictions))
print()
print('Accuracy: ', accuracy_score(iris_target ,predictions))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       0.92      0.94      0.93        50
          2       0.94      0.92      0.93        50

avg / total       0.95      0.95      0.95       150

Confusion Matrix: 
 [[50  0  0]
 [ 0 47  3]
 [ 0  4 46]]

Accuracy:  0.953333333333


In [17]:
# knn on signle input
def knn_single(dataset, k, newrow, dataset_target):
    for i in range(0,dataset.shape[0]):
        row0 = newrow
        dis = []
        for row in dataset:
            dis.append(cal_dis(row0, row))
        array = np.array(dis)
        index = k_neighbor_index(array=array, k=k)
        prediction = select_major(index= index, dataset_target= dataset_target)
    return prediction 

In [18]:
# predict on a new input
newrow = [5.7,2.9,4.2,1.3]
label = knn_single(iris_data, 5, newrow, iris_target)
print('Data=%s, Predicted: %s' % (newrow, label))

Data=[5.7, 2.9, 4.2, 1.3], Predicted: 2


In [19]:
# weighted knn
def weighted_classify(dataset,k, dataset_target, weight_method):
    predictions = []
    for i in range(0,dataset.shape[0]):
        row0 = dataset[i]
        dis = []
        for row in dataset:
            dis.append(cal_dis(row0, row))
        array = np.array(dis)
        sortedDistIndex = np.argsort(array)

        classCount = {}
        for j in range(k):
            index = sortedDistIndex[j+1]
            voteLabel = dataset_target[index]
            if weight_method == 1:
                weight = 1/(dis[index]+1)
            else:
                weight = 1/(dis[index]**2+1)
            classCount[voteLabel] = classCount.get(voteLabel, 0) + weight*1
            maxCount = 0
            for key, value in classCount.items():
                if value > maxCount:
                    maxCount = value
                    classes = key
        predictions.append(classes)
    return predictions

In [20]:
predictions = weighted_classify(iris_data, 99, iris_target, weight_method=1)
print(predictions)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]


In [21]:
print(classification_report(iris_target ,predictions))
print('Confusion Matrix: \n',confusion_matrix(iris_target ,predictions))
print()
print('Accuracy: ', accuracy_score(iris_target ,predictions))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       0.81      0.94      0.87        50
          2       0.93      0.78      0.85        50

avg / total       0.91      0.91      0.91       150

Confusion Matrix: 
 [[50  0  0]
 [ 0 47  3]
 [ 0 11 39]]

Accuracy:  0.906666666667


In [22]:
predictions = weighted_classify(iris_data, 99, iris_target, weight_method=2)
print(predictions)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1]


In [23]:
print(classification_report(iris_target ,predictions))
print('Confusion Matrix: \n',confusion_matrix(iris_target ,predictions))
print()
print('Accuracy: ', accuracy_score(iris_target ,predictions))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       0.76      0.88      0.81        50
          2       0.86      0.72      0.78        50

avg / total       0.87      0.87      0.87       150

Confusion Matrix: 
 [[50  0  0]
 [ 0 44  6]
 [ 0 14 36]]

Accuracy:  0.866666666667
