# 示例：手写识别系统

---

Created on 2019-06-05

Updated on 2019-06-05

Coder: 6+x

Github: https://github.com/Jiachengciel/ML-in-Action

---

图像为32x32像素的黑白图像

'trainingDigits' 中包含约2000个实例

'testDigits' 中包含约900个实例

需要将32x32的信息转换为1x1024

In [1]:
import os
import numpy as np
import collections
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab

# IPython magic (only valid inside Jupyter/IPython, not plain Python):
# selects matplotlib's interactive "notebook" backend.
%matplotlib notebook
# Default figure size as (width, height) in inches.
pylab.rcParams['figure.figsize'] = (5, 5)
# Make 'SimHei' the sans-serif font and use the sans-serif family by default,
# presumably so Chinese text in plots renders correctly (requires SimHei to be
# installed on the system -- verify).
matplotlib.rcParams['font.sans-serif']=['SimHei']
matplotlib.rcParams['font.family']='sans-serif'
# Render minus signs with the ASCII hyphen instead of the Unicode minus glyph,
# which some CJK fonts lack.
matplotlib.rcParams['axes.unicode_minus']=False

#coding:utf-8

### KNN

In [2]:
def classify0(inX, dataset, labels, k):
    '''
    Classify one vector with the k-nearest-neighbours (kNN) rule.

    The Euclidean distance from ``inX`` to every row of ``dataset`` is
    computed, the ``k`` closest rows are selected, and the label that occurs
    most often among them is returned.

    Args:
        inX: input vector; anything that broadcasts against the rows of
            ``dataset`` (e.g. shape ``(n_features,)`` or ``(1, n_features)``).
        dataset: training samples, one sample per row.
        labels: sequence of labels; ``labels[i]`` belongs to ``dataset[i]``.
        k: number of nearest neighbours that vote (must be >= 1). If ``k``
            exceeds the number of samples, every sample votes.

    Returns:
        The majority label among the k nearest neighbours. When several
        labels are tied, the one whose first vote came from the closest
        neighbour wins.

    Raises:
        ValueError: if ``k`` is smaller than 1 or ``dataset`` has no rows.
    '''
    # Fail early with a clear message instead of an IndexError further down.
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    if len(dataset) == 0:
        raise ValueError("dataset must contain at least one sample")

    # Euclidean distance between inX and every training sample
    dist = np.sqrt(np.sum(np.power(inX - dataset, 2), axis=1))
    # Indices ordered by increasing distance; a stable sort keeps equidistant
    # samples in their original order so the result is deterministic.
    sortedDistIndices = np.argsort(dist, kind='stable')
    # Labels of the k nearest samples, nearest first
    k_labels = [labels[index] for index in sortedDistIndices[:k]]
    # Majority vote; Counter keeps first-seen order for equal counts
    return collections.Counter(k_labels).most_common(1)[0][0]

### 图像信息转为数组

In [3]:
def img2vector(filename, size=32):
    '''
    Read a text-encoded square image and flatten it into a row vector.

    The file is expected to hold at least ``size`` lines, each starting with
    at least ``size`` digit characters (one character per pixel). Pixels are
    stored row by row, so pixel (i, j) ends up at index ``size * i + j``.

    Args:
        filename: path of the image text file.
        size: number of rows and columns to read (default 32, giving a
            1x1024 vector).

    Returns:
        returnVect: float array of shape ``(1, size * size)`` holding the
        pixel values.

    Raises:
        ValueError: if a line is missing or shorter than ``size`` characters,
            or if a pixel character is not a valid integer.
    '''
    returnVect = np.zeros((1, size * size))
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        for i in range(size):
            # Drop the line terminator so short lines are detected reliably
            lineStr = fr.readline().rstrip('\r\n')
            if len(lineStr) < size:
                raise ValueError(
                    "%s: line %d has %d characters, expected at least %d"
                    % (filename, i + 1, len(lineStr), size)
                )
            # Fill one image row of the flattened vector at a time
            returnVect[0, size * i:size * (i + 1)] = [int(ch) for ch in lineStr[:size]]

    return returnVect

In [4]:
# Sanity check: convert one sample file from the test set into a vector
testVector = img2vector('testDigits/0_13.txt')

In [5]:
# Display the first 16 values of the converted vector
testVector[0, 0:16]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.])

### 算法测试

In [6]:
def _digit_label(fileName):
    '''Return the integer before the first '_' / '.' of fileName, or None if it is not an integer.'''
    try:
        return int(fileName.split('.')[0].split('_')[0])
    except ValueError:
        return None


def _load_digit_dir(dirName):
    '''Load every labelled sample in dirName; return (list of image vectors, list of labels).'''
    vectors = []
    labels = []
    for fileName in os.listdir(dirName):
        classNumber = _digit_label(fileName)
        if classNumber is None:
            # Skip entries without a numeric label prefix, e.g. '.DS_Store'
            # or Jupyter's '.ipynb_checkpoints' directory.
            continue
        vectors.append(img2vector(os.path.join(dirName, fileName)))
        labels.append(classNumber)
    return vectors, labels


def handwritingClassTest(file_train='trainingDigits', file_test='testDigits', k=4):
    '''
    Evaluate the kNN classifier on the handwritten-digit data set.

    Every sample in ``file_train`` forms the reference set. Every sample in
    ``file_test`` is classified with ``classify0`` and compared with the
    label encoded at the start of its file name (the part before the first
    '_'). One prediction out of every 100 is printed, followed by the total
    number of errors and the accuracy.

    Args:
        file_train: directory holding the training samples.
        file_test: directory holding the samples used for evaluation.
        k: number of nearest neighbours used by the classifier.

    Raises:
        ValueError: if either directory contains no labelled sample.
    '''
    # Build the training matrix (one flattened image per row) and its labels
    trainVectors, Labels = _load_digit_dir(file_train)
    if not Labels:
        raise ValueError("no training samples found in %r" % (file_train,))
    traingMat = np.vstack(trainVectors)

    # Load the evaluation samples
    testVectors, testLabels = _load_digit_dir(file_test)
    n = len(testLabels)
    if n == 0:
        # Avoids a ZeroDivisionError when computing the accuracy below
        raise ValueError("no test samples found in %r" % (file_test,))

    # Kept as a float so the printed total keeps its original format
    errorNum = 0.0
    for i, (TestVector, classNumber) in enumerate(zip(testVectors, testLabels)):
        # kNN classification
        classfierResult = classify0(TestVector, traingMat, Labels, k)
        # Print a sample of the predictions
        if i % 100 == 0:
            print("Class predict is: %d, Real answer is: %d" % (classfierResult, classNumber))
        # Count misclassified samples
        if classfierResult != classNumber:
            errorNum += 1.0

    print("\nThe total number of errors is: ", errorNum)
    print("\nThe accuracy is: %.2f %%" % ((1.00 - float(errorNum/n)) * 100.00))
    

In [7]:
# Run the evaluation; predictions and summary are printed below
handwritingClassTest()

Class predict is: 4, Real answer is: 4
Class predict is: 0, Real answer is: 0
Class predict is: 9, Real answer is: 9
Class predict is: 8, Real answer is: 8
Class predict is: 8, Real answer is: 8
Class predict is: 7, Real answer is: 7
Class predict is: 2, Real answer is: 2
Class predict is: 5, Real answer is: 5
Class predict is: 3, Real answer is: 3
Class predict is: 9, Real answer is: 9

The total number of errors is:  11.0

The accuracy is: 98.84 %
