本文是《机器学习实战》朴素贝叶斯过滤垃圾邮件练习。

使用的数据集包含50封邮件，其中25封为垃圾邮件。训练集40封，测试集10封。

In [3]:
import re

# 读取邮件数据
def loadEmailFile(filename):
    fr = open(filename)
    
    # 使用正则表达式进行分词
    regEx = re.compile('\\W*')
    wordList = regEx.split(fr.read())
    return [tok.lower() for tok in wordList if len(tok) > 2]

打印出来看看效果：

In [5]:
wordList = loadEmailFile('email/spam/8.txt')
wordList

['you',
 'have',
 'everything',
 'gain',
 'incredib1e',
 'gains',
 'length',
 'inches',
 'yourpenis',
 'permanantly',
 'amazing',
 'increase',
 'thickness',
 'yourpenis',
 'betterejacu1ation',
 'control',
 'experience',
 'rock',
 'harderecetions',
 'explosive',
 'intenseorgasns',
 'increase',
 'volume',
 'ofejacu1ate',
 'doctor',
 'designed',
 'and',
 'endorsed',
 '100',
 'herbal',
 '100',
 'natural',
 '100',
 'safe']

在算法训练和测试过程中，每个文件都需要使用该函数读取并返回词条向量

In [6]:
# 创建词汇表
def createVocabList(docList):
    # 初始化集合
    vocabSet = set([])
    
    # 遍历docList，提取所有出现过的单词
    for document in docList:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

In [7]:
from numpy import *

# 创建朴素贝叶斯词袋模型
def bagOfWords2VecMN(vocabList,inputSet):
    # 初始化词向量，每个元素对应词汇表中的一个单词，初始化为0
    returnVec = [0]*len(vocabList)
    
    # 遍历输入的邮件数据，每遇到一个词，词向量中对应的值加1
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

朴素贝叶斯算法是一个比较简单的，基于概率论的分类算法

In [8]:
# 朴素贝叶斯算法训练函数
def trainNB0(trainMatrix,trainCategory):
    # 文档数量
    numTrainDocs = len(trainMatrix)
    # 数据集中的词汇量
    numWords = len(trainMatrix[0])
    
    # 初始化
    # 计算垃圾邮件在邮件总数中出现的概率
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = ones(numWords);p1Num = ones(numWords)  # 应该是用平滑
    p0Denom = 2.0;p1Denom = 2.0
    
    # 遍历每篇文档
    for i in range(numTrainDocs):
        # 判断是否是垃圾邮件
        if trainCategory[i] == 1:
            # 如果是垃圾邮件
            # 向量加法增加每个单词在p1Num向量中出现的频次,每个词都加
            # 的1，应该是用了平滑
            p1Num += trainMatrix[i]
            # 增加垃圾邮件所有词条的总计数
            p1Denom += sum(trainMatrix[i])
        else:
            # 如果是正常邮件
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    
    # 返回每个词条在垃圾邮件中出现的概率向量
    p1Vec = log(p1Num/p1Denom)
    
    # 返回每个词条在正常邮件中出现的概率向量
    p0Vec = log(p0Num/p0Denom)
    
    return p0Vec,p1Vec,pAbusive

算法返回值可以应用在对新输入的邮件进行分类，分类器的实现如下，
输入为要分类的邮件的词袋，以及训练算法返回的三个数值：

In [9]:
# 分类器
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    # 按照朴素贝叶斯算法公式计算概率
    p1 = sum(vec2Classify*p1Vec)+log(pClass1)
    p0 = sum(vec2Classify*p0Vec)+log(pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

测试

In [32]:
# 垃圾邮件朴素贝叶斯分类器测试
def spamTest():
    # 初始化邮件词汇列表，分类向量，全文列表等
    docList=[];classList=[];fullText=[]
    
    # 遍历读取所有的邮件文件
    for i in range(1,26):
        # 读取所有的垃圾文件
        wordList = loadEmailFile('email/spam/%d.txt' % i)
        # 注意python列表中append和extend区别
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # 读取所有正常文件
        wordList = loadEmailFile('email/ham/%d.txt' % i)
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
        
    # 获取单词表
    vocabList = createVocabList(docList)
    
    # 初始化训练数据集和测试数据集
    trainingSet = range(50);testSet = []
    for i in range(10):
        # 从中随机提取10封作为测试集
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    
    # 构建训练算法所需要的输入参数
    trainMat=[];trainClasses=[]
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    
    # 执行训练算法，获得概率向量
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    
    # 进行分类器测试
    errorCount = 0
    
    # 循环读取测试邮件
    predicted = []
    for docIndex in testSet:
        # 获得词袋
        wordVector = bagOfWords2VecMN(vocabList,docList[docIndex])
        predict = classifyNB(array(wordVector),p0V,p1V,pSpam)
        # 分类后与实际类别进行对比
        if predict != classList[docIndex]:
            errorCount += 1
            print "classification error",docList[docIndex]
        predicted.append(predict)
    
    # 打印错误率
    print "the error rate is: ",float(errorCount)/len(testSet)
    print predicted

In [58]:
from sklearn.naive_bayes import GaussianNB

# 用sklearn里的贝叶斯
def sk_spamTest():
    # 初始化邮件词汇列表，分类向量，全文列表等
    docList=[];classList=[];fullText=[]
    
    # 遍历读取所有的邮件文件
    for i in range(1,26):
        # 读取所有的垃圾文件
        wordList = loadEmailFile('email/spam/%d.txt' % i)
        # 注意python列表中append和extend区别
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # 读取所有正常文件
        wordList = loadEmailFile('email/ham/%d.txt' % i)
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
        
    # 获取单词表
    vocabList = createVocabList(docList)
    
    # 初始化训练数据集和测试数据集
    trainingSet = range(50);testSet = []
    for i in range(10):
        # 从中随机提取10封作为测试集
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    
    # 构建训练算法所需要的输入参数
    trainMat=[];trainClasses=[]
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    
    # 执行训练算法，获得概率向量
#     p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    model = GaussianNB()
    model.fit(array(trainMat),array(trainClasses))
    
    testMat = [];testClasses=[]
    for docIndex in testSet:
        testMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))
        testClasses.append(classList[docIndex])
        
    
    predicted = model.predict(array(testMat))
    
    # 进行分类器测试
    errorCount = 0
    for docIndex in testClasses:
#         print docIndex
        if predicted[docIndex]!=testClasses[docIndex]:
            errorCount += 1
        
    print "the error rate is: ",float(errorCount)/len(testClasses)
    print testClasses
    print predicted

In [59]:
spamTest()

classification error ['experience', 'with', 'biggerpenis', 'today', 'grow', 'inches', 'more', 'the', 'safest', 'most', 'effective', 'methods', 'of_penisen1argement', 'save', 'your', 'time', 'and', 'money', 'bettererections', 'with', 'effective', 'ma1eenhancement', 'products', 'ma1eenhancement', 'supplement', 'trusted', 'millions', 'buy', 'today']
the error rate is:  0.1
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]


In [63]:
sk_spamTest()

the error rate is:  0.0
[0, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[0 1 0 1 1 0 0 0 0 0]
