### 这是朴素贝叶斯分类器的一个最著名的应用

In [7]:

import numpy as np

# 之前定义好的函数，直接复制过来
def trainNB0(trainMatrix, trainCategory):
    """trainMatrix:one-hot类型,自变量X
    trainCategory::因变量y"""
    num = len(trainMatrix)    # 样本数量
    numWords = len(trainMatrix[0])    # X(one-hot)特征个数
    pAbusive = sum(trainCategory)/float(num)    # 侮辱性样本比例，即：p(1)
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 1.0
    p1Denom = 1.0    # 这几行代码都做了更改，分子分母都加1，防止出现0的情况

    for i in range(num):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)
    p0Vect = np.log(p0Num/p0Denom)    # 取对数，防止计算机下溢出

    return p0Vect, p1Vect, pAbusive


def createVocabList(dataSet):
    """返回dataSet中所有unique的单词"""
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """将inputSet(单个样本)转化为one-hot形式,特征为vocabList,取值为0,1"""
    returnVec = [0]*len(vocabList)
    for vec in vocabList:
        if vec in inputSet:
            returnVec[vocabList.index(vec)] = 1
    return returnVec


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # 两项都取对数，由原来的乘变成了加（第一项在上面的函数中取过对数了）
    p1 = np.sum(vec2Classify*p1Vec)+np.log(pClass1)
    p0 = np.sum(vec2Classify*p0Vec)+np.log(1-pClass1)
    if p1 > p0:
        return 1
    else:
        return 0


### 4.6.2 测试算法：使用朴素贝叶斯进行交叉验证
#### 程序清单4-5 文件解析及完整的垃圾邮件测试函数

In [8]:
def textParse(bigString):
    """将长字符串解析为单词列表"""
    import re
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def spamTest():
    docList = []
    classList = []
    fullList = []

    for i in range(1, 26):
        # 正例
        wordList = textParse(
            open('D:\\机器学习实战代码\\machinelearninginaction\\Ch04\\email\\spam\\%d.txt' % i).read())
        docList.append(wordList)
        fullList.extend(wordList)
        classList.append(1)
        # 反例
        wordList = textParse(
            open('D:\\机器学习实战代码\\machinelearninginaction\\Ch04\\email\\ham\\%d.txt' % i,
                 encoding="ISO-8859-1").read())
        docList.append(wordList)
        fullList.extend(wordList)
        classList.append(1)

    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    # 随机构建训练集和测试集
    for i in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])    # 测试集
        del(trainingSet[randIndex])    # 将抽出去的测试集从trainingSet中删除，余下的作为训练集

    trainMat = []    # 训练数据的矩阵，为one-hot
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(
            vocabList, docList[docIndex]))    # 加入一行one-hot
        trainClasses.append(classList[docIndex])

    # 训练
    p0V, p1V, pSam = trainNB0(np.array(trainMat), np.array(trainClasses))

    # 测试
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordList), p0V, p1V, pSam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount)/len(testSet))


spamTest()


the error rate is:  0.0


  p0 = np.sum(vec2Classify*p0Vec)+np.log(1-pClass1)
