# 手写数字识别(kNN 多分类)

* 需要识别的数字是32*32像素的黑白图像

* 训练集(trainingDigits 目录)大约有2000个样本，测试集(testDigits 目录)大约有900个样本。

### 将图像文本数据转换为向量

In [39]:
import numpy as np
import operator#运算符模块
from os import listdir
from collections import Counter

In [40]:
def img2vector(filename):
    """Read a 32x32 text image into a 1x1024 row vector.

    The file is expected to hold at least 32 lines, each starting with at
    least 32 digit characters; only those 32x32 characters are read.

    Args:
        filename: Path of the text file to read.

    Returns:
        A numpy array of shape (1, 1024). Element ``32*i + j`` holds the
        integer value of character ``j`` on line ``i``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character read is not a valid integer digit.
    """
    returnVet = np.zeros((1, 1024))
    # The context manager guarantees the handle is closed, even when parsing
    # fails; this function is called once per sample, so leaks add up quickly.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVet[0, 32 * i + j] = int(lineStr[j])
    return returnVet

In [41]:
# Sanity check: convert one sample file and print the first 32 entries of the
# resulting vector (i.e. the first text line of the image).
testVector = img2vector('testDigits/0_13.txt')
print(testVector[0,0:32])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]


In [42]:
#计算距离
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Args:
        inX: The sample to classify; it is tiled once per row of ``dataSet``
            before the row-wise subtraction.
        dataSet: 2-D numpy array with one training sample per row.
        labels: Sequence of hashable labels, indexed like the rows of
            ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the k nearest rows. When several
        labels share the top count, the one encountered first (i.e. belonging
        to the nearer neighbour) is returned.
    """
    sampleCount = dataSet.shape[0]

    # Euclidean distance between inX and every training row.
    offsets = np.tile(inX, (sampleCount, 1)) - dataSet
    distances = np.sum(offsets ** 2, axis=1) ** 0.5

    # Row indices ordered from nearest to farthest.
    nearestFirst = np.argsort(distances)

    # Tally the labels of the k nearest rows; most_common() sorts by count
    # (descending) with a stable sort, so insertion order breaks ties.
    votes = Counter(labels[nearestFirst[i]] for i in range(k))
    return votes.most_common()[0][0]

### 分类

In [43]:
def handwritingClass(trainDir='trainingDigits', testDir='testDigits', k=3):
    """Evaluate the kNN digit classifier and print the results.

    Every file in ``trainDir`` becomes one training row; every file in
    ``testDir`` is classified with ``classify0`` and compared against the
    label encoded in its own file name. Per-sample results, the total number
    of errors and the error rate are printed to stdout.

    File names are parsed as ``<label>_<anything>.<ext>``: the integer before
    the first underscore is the true class.

    Args:
        trainDir: Directory containing the training sample files.
        testDir: Directory containing the test sample files.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None.
    """
    # Load the training set: hwlabels[i] is the label of row i of trainMat.
    trainingFileList = listdir(trainDir)
    m = len(trainingFileList)
    hwlabels = []
    trainMat = np.zeros((m, 1024))
    for i, fileName in enumerate(trainingFileList):
        # Drop the extension, then take the digit before the underscore.
        classNum = int(fileName.split('.')[0].split('_')[0])
        hwlabels.append(classNum)
        # Flatten the 32x32 image into row i (1x1024).
        trainMat[i, :] = img2vector('%s/%s' % (trainDir, fileName))

    # Classify every test sample and count the mismatches.
    testFileList = listdir(testDir)
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = int(fileNameStr.split('.')[0].split('_')[0])
        # Load the *test* file and compare against the *test* label; the
        # loop variables of the training loop above must not be reused here.
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainMat, hwlabels, k)
        print ("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print ("\nthe total number of errors is: %d" % errorCount)
    # Avoid a ZeroDivisionError when the test directory is empty.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print ("\nthe total error rate is: %f" % errorRate)

In [45]:
# Run the full train/test evaluation; all results are printed to stdout.
handwritingClass()