In [1]:
# 定义词表到向量的转换函数
def loadDataSet():
    """Return the toy training corpus used throughout the notebook.

    Returns:
        (postingList, classVec): six tokenised posts and their labels,
        where 1 marks an abusive post and 0 a normal one.
    """
    # Each entry pairs a whitespace-separated post with its label.
    labelledPosts = (
        ('my dog has flea problems help please', 0),
        ('maybe not take him to dog park stupid', 1),
        ('my dalmation is so cute I love him', 0),
        ('stop posting stupid worthless garbage', 1),
        ('mr licks ate my steak how to stop him', 0),
        ('quit buying worthless dog food stupid', 1),
    )
    postingList = [sentence.split() for sentence, _ in labelledPosts]
    classVec = [label for _, label in labelledPosts]
    return postingList, classVec

# 统计所有文档中出现的词条列表 
def createVocabList(dataSet):
    """Collect every distinct token that occurs in any document.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing each distinct token exactly once. The order is
        whatever iterating the underlying set yields, i.e. arbitrary.
    """
    uniqueTokens = set()
    for tokens in dataSet:
        uniqueTokens.update(tokens)
    return list(uniqueTokens)

# 根据词条列表中的词条是否在文档中出现(出现1，未出现0)，将文档转化为词条向量
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary (set-of-words model).

    Args:
        vocabList: list of vocabulary tokens (hashable, e.g. strings).
        inputSet: iterable of tokens making up one document.

    Returns:
        A list of len(vocabList) integers; position i is 1 when vocabList[i]
        occurs in the document and 0 otherwise. If a token appears more than
        once in vocabList, only its first position is set.

    Side effects:
        Prints a notice for every document token that is not in the vocabulary.
    """
    # Build the token -> first-position map once, rather than scanning the
    # vocabulary twice (`in` + `.index`) for every word of the document.
    position = {}
    for idx, token in enumerate(vocabList):
        position.setdefault(token, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[idx] = 1
    return returnVec

In [2]:
# Load the toy posts together with their 0/1 class labels
listOPosts, listClasses = loadDataSet()

In [3]:
# Build the vocabulary: every distinct word that occurs in any post
myVocabList = createVocabList(listOPosts)

In [4]:
myVocabList

['flea',
 'garbage',
 'worthless',
 'problems',
 'stop',
 'food',
 'park',
 'cute',
 'to',
 'my',
 'how',
 'quit',
 'buying',
 'posting',
 'not',
 'take',
 'licks',
 'is',
 'dalmation',
 'I',
 'help',
 'mr',
 'maybe',
 'please',
 'has',
 'stupid',
 'dog',
 'him',
 'steak',
 'so',
 'ate',
 'love']

In [5]:
setOfWords2Vec(myVocabList, listOPosts[0])

[1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0]

In [6]:
setOfWords2Vec(myVocabList, listOPosts[3])

[0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0]

In [7]:
from numpy import *

# 朴素贝叶斯分类器训练函数
def trainNB0(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model from word vectors.

    Args:
        trainMatrix: sequence of equal-length numeric word vectors, one per
            training document (must contain at least one document).
        trainCategory: sequence of labels aligned with trainMatrix. Entries
            equal to 1 are counted as class 1, everything else as class 0.

    Returns:
        (p0Vect, p1Vect, pAbusive):
            p0Vect / p1Vect -- arrays of per-word log conditional
                probabilities for class 0 / class 1.
            pAbusive -- sum(trainCategory) / number of documents, i.e. the
                class-1 prior when the labels are 0/1.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs) # prior probability of class 1

    # Laplace smoothing: every word count starts at 1 (ones() instead of
    # zeros()) and every denominator starts at 2.0.
    p0Num = ones(numWords); p1Num = ones(numWords)
    # Both denominators must be plain floats. The previous `2.0,` (stray
    # trailing comma) made p0Denom a one-element tuple.
    p0Denom = 2.0; p1Denom = 2.0

    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])

    # Logs are taken so that later multiplying many small probabilities
    # (done as a sum of logs) does not underflow.
    p1Vect = log(p1Num/p1Denom)
    p0Vect = log(p0Num/p0Denom)

    # Alternative kept from the notebook author, who implemented it straight
    # from the naive Bayes formula (the book's code follows its own approach).
    # It divides by the smoothed number of documents per class instead of the
    # number of words per class:
    #
    #     for i in range(numTrainDocs):
    #         if trainCategory[i] == 1:
    #             p1Num += trainMatrix[i]
    #         else:
    #             p0Num += trainMatrix[i]
    #     p1Vect = p1Num/(sum(trainCategory) + 2)
    #     p0Vect = p0Num/(numTrainDocs - sum(trainCategory) + 2)

    return p0Vect, p1Vect, pAbusive

In [8]:
# Build the training matrix: one 0/1 presence vector per post
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))

In [9]:
# Train the classifier: per-class log word probabilities plus the class-1 prior
p0V, p1V, pAb = trainNB0(trainMat, listClasses)

In [10]:
pAb

0.5

In [11]:
p0V

array([-2.56494936, -3.25809654, -3.25809654, -2.56494936, -2.56494936,
       -3.25809654, -3.25809654, -2.56494936, -2.56494936, -1.87180218,
       -2.56494936, -3.25809654, -3.25809654, -3.25809654, -3.25809654,
       -3.25809654, -2.56494936, -2.56494936, -2.56494936, -2.56494936,
       -2.56494936, -2.56494936, -3.25809654, -2.56494936, -2.56494936,
       -3.25809654, -2.56494936, -2.15948425, -2.56494936, -2.56494936,
       -2.56494936, -2.56494936])

In [12]:
p1V

array([-3.04452244, -2.35137526, -1.94591015, -3.04452244, -2.35137526,
       -2.35137526, -2.35137526, -3.04452244, -2.35137526, -3.04452244,
       -3.04452244, -2.35137526, -2.35137526, -2.35137526, -2.35137526,
       -2.35137526, -3.04452244, -3.04452244, -3.04452244, -3.04452244,
       -3.04452244, -3.04452244, -2.35137526, -3.04452244, -3.04452244,
       -1.65822808, -1.94591015, -2.35137526, -3.04452244, -3.04452244,
       -3.04452244, -3.04452244])

In [26]:
# 朴素贝叶斯分类函数

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely of the two classes for one word vector.

    Args:
        vec2Classify: array word vector of the document to classify.
        p0Vec: array of per-word log probabilities for class 0.
        p1Vec: array of per-word log probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 log score is strictly larger, otherwise 0.
    """
    # Log score = sum of the word log-probabilities weighted by the word
    # vector, plus the log prior of the class.
    scoreClass1 = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreClass0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)

    # Alternative kept from the notebook author, who implemented it straight
    # from the naive Bayes formula (the book's code follows its own approach).
    # It takes the logs itself and also scores words absent from the document:
    #
    #     p1 = 0.0; p0 = 0.0
    #     for i in range(len(vec2Classify)):
    #         if vec2Classify[i] == 1:
    #             p1 += log(p1Vec[i])
    #             p0 += log(p0Vec[i])
    #         else:
    #             p1 += log(1 - p1Vec[i])
    #             p0 += log(1 - p0Vec[i])
    #     p1 += log(pClass1)
    #     p0 += log(1.0 - pClass1)

    return 1 if scoreClass1 > scoreClass0 else 0
    
def testingNB():
    """Train on the toy posts and print the predicted class of two sample documents.

    Builds the vocabulary and the presence-vector training matrix from
    loadDataSet(), trains with trainNB0, then classifies
    ['love', 'my', 'dalmation'] and ['stupid', 'garbage'] and prints one
    line per document.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    # Both samples go through the same encode -> classify -> print path.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

In [14]:
testingNB()

['love', 'my', 'dalmation'] calssified as:  0
['stupid', 'garbage'] classified as:  1


In [15]:
# 朴素贝叶斯词袋模型
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a count vector over the vocabulary (bag-of-words model).

    Args:
        vocabList: list of vocabulary tokens.
        inputSet: iterable of tokens making up one document.

    Returns:
        A list of len(vocabList) integers; position i holds how many times
        vocabList[i] occurs in the document. Tokens missing from the
        vocabulary are ignored silently.
    """
    counts = [0 for _ in range(len(vocabList))]
    for token in inputSet:
        try:
            slot = vocabList.index(token)
        except ValueError:
            # Token is not part of the vocabulary: skip it.
            continue
        counts[slot] += 1
    return counts

示例：使用朴素贝叶斯过滤垃圾邮件

In [16]:
# Sample sentence used below to compare ways of splitting text into tokens
mySent = "This book is the best book on Python or M.L.I have ever laid eyes upon."

In [17]:
mySent.split()

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M.L.I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

In [18]:
# Split the sentence into tokens with a regular expression.
# The separator is `\W+` (one or more non-word characters). `\W*` can match
# the empty string, and since Python 3.7 re.split() also splits on empty
# matches, which would cut the text between every single character.
import re
regEx = re.compile(r'\W+')
listOfTokens = regEx.split(mySent)

  after removing the cwd from sys.path.


In [19]:
listOfTokens

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon',
 '']

In [20]:
[tok.lower() for tok in listOfTokens if len(tok)>0]

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'm',
 'l',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [21]:
# Read one sample e-mail; the `with` block closes the file handle right away
with open('email/ham/6.txt') as emailFile:
    emailText = emailFile.read()

In [22]:
# Tokenise the e-mail text with the regEx pattern compiled earlier
listOfTokens = regEx.split(emailText)

  """Entry point for launching an IPython kernel.


In [23]:
listOfTokens

['Hello',
 'Since',
 'you',
 'are',
 'an',
 'owner',
 'of',
 'at',
 'least',
 'one',
 'Google',
 'Groups',
 'group',
 'that',
 'uses',
 'the',
 'customized',
 'welcome',
 'message',
 'pages',
 'or',
 'files',
 'we',
 'are',
 'writing',
 'to',
 'inform',
 'you',
 'that',
 'we',
 'will',
 'no',
 'longer',
 'be',
 'supporting',
 'these',
 'features',
 'starting',
 'February',
 '2011',
 'We',
 'made',
 'this',
 'decision',
 'so',
 'that',
 'we',
 'can',
 'focus',
 'on',
 'improving',
 'the',
 'core',
 'functionalities',
 'of',
 'Google',
 'Groups',
 'mailing',
 'lists',
 'and',
 'forum',
 'discussions',
 'Instead',
 'of',
 'these',
 'features',
 'we',
 'encourage',
 'you',
 'to',
 'use',
 'products',
 'that',
 'are',
 'designed',
 'specifically',
 'for',
 'file',
 'storage',
 'and',
 'page',
 'creation',
 'such',
 'as',
 'Google',
 'Docs',
 'and',
 'Google',
 'Sites',
 'For',
 'example',
 'you',
 'can',
 'easily',
 'create',
 'your',
 'pages',
 'on',
 'Google',
 'Sites',
 'and',
 'share',


In [27]:
# 文件解析及完整的垃圾邮件测试函数
def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of lower-cased tokens, in order of appearance, keeping only
        those with more than two characters.
    """
    import re
    # Split on runs of non-word characters. `\W+` is required: `\W*` can match
    # the empty string, and since Python 3.7 re.split() splits on empty
    # matches too, which would break the text into single characters (all of
    # which the length filter below would then discard).
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
    
def _readEmailTokens(path):
    """Return the textParse() tokens of the e-mail stored at `path`, closing the file afterwards."""
    with open(path) as emailFile:
        return textParse(emailFile.read())


def spamTest():
    """Hold-out test of the naive Bayes spam filter on the bundled e-mails.

    Reads email/spam/1..25.txt (class 1) and email/ham/1..25.txt (class 0),
    picks 10 of the 50 documents at random as the test set, trains on the
    remaining 40 with set-of-words vectors and prints the error rate on the
    test set. Because the split is random, the printed rate varies from run
    to run.
    """
    docList = []; classList = []

    for i in range(1, 26):
        docList.append(_readEmailTokens('email/spam/%d.txt' % i))
        classList.append(1)
        docList.append(_readEmailTokens('email/ham/%d.txt' % i))
        classList.append(0)

    vocabList = createVocabList(docList)

    # Hold-out validation: move ten randomly chosen document indices from the
    # training set to the test set.
    # list() is needed because a bare range object does not support deletion.
    trainingSet = list(range(len(docList))); testSet = []
    for i in range(10):
        # uniform draw from [0, len), truncated to an integer index
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex]) # removed so the same document cannot be drawn twice

    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1

    print("the error rate is: ", float(errorCount)/len(testSet))

In [29]:
spamTest()

the error rate is:  0.0


  return _compile(pattern, flags).split(string, maxsplit)


示例：使用朴素贝叶斯分类器从个人广告中获取区域倾向

In [30]:
import feedparser

In [45]:
# The RSS feeds used in the book are no longer reachable, so two other feeds are used instead.
# Every article from NASA is labelled class 1 and every Yahoo Sports boxing story class 0.
# NOTE(review): the names `ny` / `sf` no longer describe the feeds; presumably they are
# left over from the book's original example -- confirm before renaming.
ny = feedparser.parse('https://www.nasa.gov/rss/dyn/image_of_the_day.rss')
sf = feedparser.parse('https://sports.yahoo.com/boxing/rss.xml')

In [46]:
len(ny['entries'])

60

In [47]:
len(sf['entries'])

47

In [48]:
ny['entries']

[{'guidislink': False,
  'id': 'http://www.nasa.gov/image-feature/technology-then-and-now',
  'link': 'http://www.nasa.gov/image-feature/technology-then-and-now',
  'links': [{'href': 'http://www.nasa.gov/image-feature/technology-then-and-now',
    'rel': 'alternate',
    'type': 'text/html'},
   {'href': 'http://www.nasa.gov/sites/default/files/thumbnails/image/29177231106_5186f7024e_o.jpg',
    'length': '1132329',
    'rel': 'enclosure',
    'type': 'image/jpeg'}],
  'published': 'Fri, 30 Mar 2018 10:49 EDT',
  'published_parsed': time.struct_time(tm_year=2018, tm_mon=3, tm_mday=30, tm_hour=14, tm_min=49, tm_sec=0, tm_wday=4, tm_yday=89, tm_isdst=0),
  'source': {'href': 'http://www.nasa.gov/rss/dyn/image_of_the_day.rss',
   'title': 'NASA Image of the Day'},
  'summary': 'Before there were computers and software that could stitch together digital images, they were printed on photo paper, trimmed by hand, and taped in place on a large black board.',
  'summary_detail': {'base': 'htt

In [67]:
# RSS源分类器及高频词去除函数

# 下面注释部分是选取频数前三十的单词进行去除
#def calcMostFreq(vocabList, fullText):
#   import operator
#    freqDict = {}
#    for token in vocabList:
#        freqDict[token] = fullText.count(token)
#    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
#    return sortedFreq[: 30]

# 用停用词表进行高频词去除
def stopWords():
    """Load the stop-word list from 'stopwords.txt' in the working directory.

    The file content is split on runs of non-word characters, so an entry
    such as "isn't" yields the two tokens 'isn' and 't'.
    See http://www.ranks.nl/stopwords for the source of the list.

    Returns:
        A list of lower-cased, non-empty stop-word tokens in file order.
    """
    import re
    with open('stopwords.txt') as stopFile:
        wordList = stopFile.read()
    # `\W+` rather than `\W*`: an empty-matching pattern makes re.split() cut
    # between every character on Python 3.7+.
    listOfTokens = re.split(r'\W+', wordList)
    # Drop the empty strings that split() produces at the edges of the text.
    # (The print/return that used to follow the first return was unreachable
    # and has been removed.)
    return [tok.lower() for tok in listOfTokens if tok]

def localWords(feed1, feed0, numTest=20):
    """Train and hold-out test a naive Bayes classifier on two parsed RSS feeds.

    Args:
        feed1: parsed feed whose entries are labelled class 1; each entry in
            feed1['entries'] must provide a 'summary' string.
        feed0: parsed feed whose entries are labelled class 0 (same layout).
        numTest: number of documents drawn at random for the test set
            (default 20, the value that used to be hard-coded).

    Returns:
        (vocabList, p0V, p1V): the vocabulary after stop-word removal and the
        per-word log probability vectors for class 0 and class 1.

    Raises:
        ValueError: if numTest does not leave at least one test document and
            at least one training document.

    Side effects:
        Prints the error rate measured on the random test set.
    """
    # fullText is only needed by the commented-out frequency-based alternative below.
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))

    # Fail with a clear message instead of an IndexError from the sampling
    # loop (too few documents) or a division by zero (empty test set).
    if not 0 < numTest < 2*minLen:
        raise ValueError(
            "numTest must be between 1 and %d for %d documents, got %r"
            % (2*minLen - 1, 2*minLen, numTest))

    # Take the same number of entries from both feeds, alternating classes.
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = createVocabList(docList)

    # Remove high-frequency words using the stop-word list; a set makes each
    # membership check O(1) instead of calling list.remove() repeatedly.
    stopWordSet = set(stopWords())
    vocabList = [word for word in vocabList if word not in stopWordSet]

    # Alternative (disabled): remove the 30 most frequent words instead.
    #top30Words = calcMostFreq(vocabList, fullText)
    #for pairW in top30Words:
        #if pairW[0] in vocabList: vocabList.remove(pairW[0])

    # Random hold-out split: move numTest document indices to the test set.
    trainingSet = list(range(2*minLen)); testSet = []
    for i in range(numTest):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])

    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    p0V, p1V, pClass1 = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pClass1) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ',float(errorCount)/len(testSet))

    return vocabList, p0V, p1V

In [77]:
vocabList, pSF, pNY = localWords(ny, sf) # using the stop-word list effectively lowers the error rate

the error rate is:  0.0


  return _compile(pattern, flags).split(string, maxsplit)


In [78]:
# 最具表征性的词汇显示函数
def getTopWords(ny, sf, threshold=-6.0):
    """Print the most characteristic words of each feed.

    Trains via localWords(ny, sf) (which also prints an error rate), keeps
    the words whose log probability for a class exceeds `threshold`, and
    prints them per class from most to least probable.

    Args:
        ny: parsed feed passed to localWords as the class-1 feed.
        sf: parsed feed passed to localWords as the class-0 feed.
        threshold: minimum (exclusive) log probability for a word to be
            listed; defaults to -6.0, the value that used to be hard-coded.
    """
    vocabList, p0V, p1V = localWords(ny, sf)

    # p0V belongs to class 0 (the `sf` feed), p1V to class 1 (the `ny` feed).
    topSF = [(vocabList[i], p0V[i]) for i in range(len(p0V)) if p0V[i] > threshold]
    topNY = [(vocabList[i], p1V[i]) for i in range(len(p0V)) if p1V[i] > threshold]

    sections = (
        ("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF*SF", topSF),
        ("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY", topNY),
    )
    for banner, scoredWords in sections:
        ranked = sorted(scoredWords, key=lambda pair: pair[1], reverse=True)
        print(banner)
        for item in ranked:
            print(item[0])

In [79]:
getTopWords(ny, sf)

the error rate is:  0.1
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF*SF
joshua
world
parker
boxing
title
heavyweight
joseph
anthony
bout
alvarez
champion
saturday
will
fight
unification
canelo
new
year
ramirez
wbo
cardiff
test
golovkin
wba
suspended
gennady
one
may
pounds
said
ibf
reuters
two
positive
says
unbeaten
middleweight
nevada
imam
training
drug
boxer
doping
putting
international
tuesday
friday
next
ahead
jose
vacant
former
rematch
right
like
mayweather
temporarily
amir
first
fighter
king
wilder
federation
failed
britain
meet
super
vegas
opponent
undefeated
deontay
rival
british
zealand
round
bob
commission
champions
last
old
000
london
terence
win
association
las
scales
council
yet
arum
tests
murray
jeopardy
briton
time
wbc
failures
showtime
now
ekpo
zealander
end
crawford
still
elbow
meat
believes
clenbuterol
weigh
battle
shields
construction
bennett
stopped
native
reporters
taking
crowd
recent
hand
women
arena
respect
banned
tipped
plans
come
big
following
champ
five
a

  return _compile(pattern, flags).split(string, maxsplit)
