In [11]:
import os
import numpy as np
import operator
from sklearn.neighbors import KNeighborsClassifier as kNN
import time

In [12]:
# Notebook-wide configuration: where the digit text files live and how many
# neighbours take part in each vote.
_data_root = 'data/knn-digits'
training_dir = _data_root + '/training_digits'
test_dir = _data_root + '/test_digits'
k_global = 3  # number of nearest neighbours used by both kNN implementations

In [13]:
# Convert a 32x32 text image into a 1x1024 row vector
def img2vector(filename):
    """Read a 32x32 grid of digit characters from a text file as a flat vector.

    The first 32 characters of each of the first 32 lines are parsed with
    ``int`` and stored row by row, so the character at line ``i``, column
    ``j`` lands at index ``32 * i + j``.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(1, 1024)``.

    Raises
    ------
    IndexError
        If one of the first 32 lines has fewer than 32 characters.
    ValueError
        If a character cannot be parsed by ``int``.
    """
    return_vector = np.zeros((1, 1024))
    # The context manager closes the file even when parsing fails; the handle
    # was previously left open, leaking one descriptor per image read.
    with open(filename) as f:
        for i in range(32):
            line = f.readline()
            for j in range(32):
                return_vector[0, 32 * i + j] = int(line[j])
    return return_vector


def load_training_data():
    """Load every file in ``training_dir`` into a feature matrix and label list.

    The integer label of a sample is the part of its file name that precedes
    the first underscore. Rows follow the order returned by ``os.listdir``.

    Returns
    -------
    tuple
        ``(training_data, training_label)`` where ``training_data`` is a float
        array with one 1024-element row per file and ``training_label`` is the
        list of integer labels in the same order.
    """
    file_names = os.listdir(training_dir)
    features = np.zeros((len(file_names), 1024))
    targets = []
    for row, name in enumerate(file_names):
        # Text before the first '_' in the file name is the digit label.
        targets.append(int(name.partition('_')[0]))
        features[row, :] = img2vector(training_dir + '/' + name)
    return features, targets


def load_test_data():
    """Lazily yield ``(vector, label)`` pairs for every file in ``test_dir``.

    The directory is listed once, when the generator is first advanced. Each
    file is parsed only when its pair is requested. The integer label is the
    part of the file name that precedes the first underscore.

    Yields
    ------
    tuple
        ``(test_data, label)`` where ``test_data`` is the value returned by
        ``img2vector`` for the file and ``label`` is an ``int``.
    """
    for name in os.listdir(test_dir):
        expected = int(name.partition('_')[0])
        sample = img2vector(test_dir + '/' + name)
        yield sample, expected

In [4]:
# Classify a single sample
def classify0(in_data, data_set, labels, k):
    """Predict a label for one sample by majority vote of its nearest neighbours.

    Neighbours are ranked by Euclidean distance between ``in_data`` and each
    row of ``data_set``.

    Parameters
    ----------
    in_data : numpy.ndarray
        The sample to classify; must broadcast against the rows of
        ``data_set`` (for example shape ``(1, n)`` or ``(n,)``).
    data_set : numpy.ndarray
        2-D array with one training sample per row.
    labels : sequence
        ``labels[i]`` is the label of row ``i`` of ``data_set``.
    k : int
        Number of neighbours that vote. Values larger than the number of
        training rows are clamped to that number instead of raising.

    Returns
    -------
    object
        The label with the most votes. On a tie, the label that received its
        first vote from the nearer neighbour wins (dict insertion order plus
        a stable sort).

    Raises
    ------
    IndexError
        If nobody can vote, i.e. ``data_set`` has no rows or ``k <= 0``.
    """
    data_size = data_set.shape[0]
    # Broadcasting subtracts every training row from the sample without
    # materialising a tiled copy of ``in_data``.
    diff_mat = in_data - data_set
    distances = (diff_mat ** 2).sum(axis=1) ** 0.5
    argsort_distances = distances.argsort()
    # Never ask for more neighbours than there are training rows.
    neighbour_count = min(k, data_size)
    class_count = {}
    for i in range(neighbour_count):
        label = labels[argsort_distances[i]]
        class_count[label] = class_count.get(label, 0) + 1
    sorted_class_count = sorted(class_count.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_class_count[0][0]


# End-to-end kNN workflow
def knn():
    """Evaluate the hand-written kNN classifier on the test set and print a summary.

    Loads the training data, classifies every test sample with ``classify0``
    using ``k_global`` neighbours, and prints the number of test samples, the
    number of misclassifications and the accuracy.
    """
    training_data, training_label = load_training_data()
    hits = 0
    misses = 0
    for sample, expected in load_test_data():
        predicted = classify0(sample, training_data, training_label, k_global)
        if predicted != expected:
            misses += 1
        else:
            hits += 1
    num_test = hits + misses
    acc = hits / num_test
    print('test number: %d, failure number: %d, accuracy: %.6f' % (num_test, misses, acc))

In [5]:
# Time one full run of the from-scratch kNN evaluation.
time_begin = time.time()
print('use knn implementing from scratch:')
knn()
time_end = time.time()
# '%.4f' prints the elapsed seconds with four decimals. The earlier '%f.4'
# printed six decimals followed by a literal ".4".
print('took %.4f s' % (time_end - time_begin))

use knn implementing from scratch:
test number: 946, failure number: 11, accuracy: 0.988372
took 6.529421.4 s


In [6]:
def knn_sklearn(algorithm):
    """Evaluate scikit-learn's kNN classifier on the test set and print a summary.

    Parameters
    ----------
    algorithm : str
        Forwarded unchanged as the ``algorithm`` argument of the classifier
        (the notebook calls this with 'auto', 'brute', 'kd_tree' and
        'ball_tree').

    The classifier is fitted on the training data with ``k_global``
    neighbours, then each test sample is predicted individually. The printed
    line has the same layout as the one produced by ``knn``.
    """
    features, targets = load_training_data()
    classifier = kNN(n_neighbors=k_global, algorithm=algorithm)
    classifier.fit(features, targets)
    hits = 0
    misses = 0
    for sample, expected in load_test_data():
        # The comparison result is used directly as the branch condition,
        # exactly as the prediction is returned by the classifier.
        if classifier.predict(sample) == expected:
            hits += 1
        else:
            misses += 1
    num_test = misses + hits
    acc = hits / (hits + misses)
    print('test number: %d, failure number: %d, accuracy: %.6f' % (num_test, misses, acc))

In [7]:
# Time one full run of the scikit-learn classifier with algorithm='auto'.
time_begin = time.time()
print('use knn from sklearn:')
knn_sklearn(algorithm='auto')
time_end = time.time()
# '%.4f' prints the elapsed seconds with four decimals. The earlier '%f.4'
# printed six decimals followed by a literal ".4".
print('took %.4f s' % (time_end - time_begin))

use knn from sklearn:
test number: 946, failure number: 10, accuracy: 0.989429
took 1.637852.4 s


In [8]:
# Time one full run of the scikit-learn classifier with algorithm='brute'.
time_begin = time.time()
print('use knn from sklearn:')
knn_sklearn(algorithm='brute')
time_end = time.time()
# '%.4f' prints the elapsed seconds with four decimals. The earlier '%f.4'
# printed six decimals followed by a literal ".4".
print('took %.4f s' % (time_end - time_begin))

use knn from sklearn:
test number: 946, failure number: 10, accuracy: 0.989429
took 1.544849.4 s


In [9]:
# Time one full run of the scikit-learn classifier with algorithm='kd_tree'.
time_begin = time.time()
print('use knn from sklearn:')
knn_sklearn(algorithm='kd_tree')
time_end = time.time()
# '%.4f' prints the elapsed seconds with four decimals. The earlier '%f.4'
# printed six decimals followed by a literal ".4".
print('took %.4f s' % (time_end - time_begin))

use knn from sklearn:
test number: 946, failure number: 10, accuracy: 0.989429
took 3.461061.4 s


In [10]:
# Time one full run of the scikit-learn classifier with algorithm='ball_tree'.
time_begin = time.time()
print('use knn from sklearn:')
knn_sklearn(algorithm='ball_tree')
time_end = time.time()
# '%.4f' prints the elapsed seconds with four decimals. The earlier '%f.4'
# printed six decimals followed by a literal ".4".
print('took %.4f s' % (time_end - time_begin))

use knn from sklearn:
test number: 946, failure number: 10, accuracy: 0.989429
took 3.181452.4 s
