In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# notebook will reload external python modules
%load_ext autoreload
%autoreload 2

In [2]:
# import data
names = ['sepal_length','sepal_width','petal_length', 'petal_width', 'class']
data_raw = pd.read_csv('./data/iris.data', names = names, header=None)
# use all data
data_used = data_raw

all_num = data_used.shape[0]
val_num = all_num // 5
test_num = all_num // 5
train_num = all_num - val_num - test_num

print('all_num:{}, val_num:{}, test_num:{}, train_num:{}'.format(all_num, val_num, test_num, train_num))

all_num:150, val_num:30, test_num:30, train_num:90


In [3]:
# change the label into {0, 1, 2}
data_mdf = data_used.copy()
for idx in range(all_num):
    item = data_mdf.loc[idx, 'class']
    if item == 'Iris-setosa':
        data_mdf.loc[idx, 'class'] = 0
    elif item == 'Iris-versicolor':
        data_mdf.loc[idx, 'class'] = 1
    elif item == 'Iris-virginica':
        data_mdf.loc[idx, 'class'] = 2
    else:
        print('item={}'.format(item))
        raise ValueError("data_label unmatched")

# shuffle the set
data_shuffled = np.array(data_mdf).astype(float)
np.random.shuffle(data_shuffled)

# mask
train_mask = range(train_num)
val_mask = range(train_num,train_num+val_num)
test_mask = range(train_num + val_num, train_num + val_num + test_num)
# devide set into 3 parts (train, val, test)
x_train = data_shuffled[train_mask, 0:4]
y_train = data_shuffled[train_mask, 4]
x_val = data_shuffled[val_mask, 0:4]
y_val = data_shuffled[val_mask, 4]
x_test = data_shuffled[test_mask, 0:4]
y_test = data_shuffled[test_mask, 4]


In [4]:
# preprocess the data
mean = np.mean(x_train[:4], axis=0, keepdims=True)
std = np.std(y_train[:4], axis=0, keepdims=True)
print('mean:{}, std:{}'.format(mean, std))
x_train = (x_train - mean) / (std + 1e-7)
x_val = (x_val - mean) / (std + 1e-7)
x_test = (x_test - mean) / (std + 1e-7)

mean:[[5.8   3.175 3.375 0.95 ]], std:[0.8291562]


In [5]:
# knn
class kNN():
    def __init__(self, X, Y, class_num):
        '''
        x : x_train
        y : y_train
        class_num : the num of class in y
        '''
        self.X = X
        self.Y = Y
        self.class_num = class_num
    def pred(self, x, k):
        preds = []
        for item in x:
            dst = np.sum((self.X - item)**2, axis=1)**0.5
            indices = np.argsort(dst)[:k]
            y = self.Y[indices]
            count = np.zeros(self.class_num)
            for class_idx in range(self.class_num):
                count[class_idx] = np.count_nonzero(y == class_idx)
            preds.append(np.argmax(count, axis=0))
        return preds

In [6]:
def check_accuracy(Y, fx):
    num_correct = np.sum(Y == fx)
    num_samples = len(Y)
    acc = float(num_correct) / num_samples
    print('Got {} / {} correct {:.2%}'.format(num_correct, num_samples, acc))

In [7]:
class_num = 3
k = 5
model = kNN(x_train, y_train, class_num)
preds = model.pred(x_val, k)
check_accuracy(y_val, preds)

Got 30 / 30 correct 100.00%


In [8]:
preds = model.pred(x_test, k)
check_accuracy(y_test, preds)

Got 29 / 30 correct 96.67%
