# 4.5 使用python进行文本分类

## 4-1 词表到向量的转换函数

In [1]:
def loadDataSet():
    """Return the toy message-board corpus used by the examples below.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` is a list of six
        tokenised posts; ``classVec`` holds the label of each post in the
        same order (1 = abusive, 0 = not abusive).
    """
    # Keep every label next to its post so the two lists cannot drift apart.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [tokens for _, tokens in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec

def createVocabList(dataSet):
    """Return a list of the distinct tokens that occur in any document.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing every distinct token exactly once. The order is
        whatever set iteration yields, so it is not guaranteed to be stable
        between interpreter runs.
    """
    # One n-ary union instead of rebuilding the accumulated set per document.
    return list(set().union(*dataSet))

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: list of known tokens; position i of the result refers to
            ``vocabList[i]``. Tokens must be hashable.
        inputSet: iterable of tokens making up the document.

    Returns:
        A list of ``len(vocabList)`` ints: 1 where the token occurs in the
        document at least once, 0 otherwise. A message is printed for every
        token that is not in the vocabulary.
    """
    # Build the token -> position map once instead of scanning the vocabulary
    # twice (``in`` + ``.index``) per word. setdefault keeps the first
    # position of a repeated token, which is what ``list.index`` returns.
    position = {}
    for idx, token in enumerate(vocabList):
        position.setdefault(token, idx)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[idx] = 1  # mark presence only; repeats do not accumulate
    return returnVec

In [2]:
listOPosts,listClasses=loadDataSet()

In [3]:
myVocabList=createVocabList(listOPosts)

In [4]:
myVocabList

['please',
 'to',
 'cute',
 'so',
 'take',
 'love',
 'not',
 'stop',
 'how',
 'problems',
 'ate',
 'him',
 'food',
 'park',
 'dog',
 'maybe',
 'has',
 'stupid',
 'garbage',
 'is',
 'worthless',
 'licks',
 'posting',
 'dalmation',
 'my',
 'quit',
 'help',
 'buying',
 'steak',
 'flea',
 'I',
 'mr']

In [5]:
setOfWords2Vec(myVocabList,listOPosts[0])

[1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0]

In [6]:
setOfWords2Vec(myVocabList,listOPosts[3])

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

## 4-2 朴素贝叶斯分类器训练函数

In [7]:
import numpy as np

In [8]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from raw counts (no smoothing, no logs).

    Args:
        trainMatrix: sequence of per-document word vectors of equal length.
        trainCategory: sequence of labels, one per document; a label equal
            to 1 marks the abusive class, anything else the other class.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: for each class, the per-word counts
        divided by the total count of that class, plus
        ``sum(trainCategory) / number of documents``.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)
    # Slot 0 accumulates the non-abusive class, slot 1 the abusive class.
    wordTotals = [np.zeros(vocabSize), np.zeros(vocabSize)]
    classTotals = [0.0, 0.0]
    for row, wordVec in enumerate(trainMatrix):
        slot = 1 if trainCategory[row] == 1 else 0
        wordTotals[slot] += wordVec        # per-word counts for this class
        classTotals[slot] += sum(wordVec)  # total word count for this class
    p1Vect = wordTotals[1] / classTotals[1]
    p0Vect = wordTotals[0] / classTotals[0]
    return p0Vect, p1Vect, pAbusive

In [9]:
trainMat=[]
for positionDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList,positionDoc))

In [10]:
p0V,p1V,pAb=trainNB0(trainMat,listClasses)

In [11]:
pAb

0.5

In [12]:
p0V

array([0.04166667, 0.04166667, 0.04166667, 0.04166667, 0.        ,
       0.04166667, 0.        , 0.04166667, 0.04166667, 0.04166667,
       0.04166667, 0.08333333, 0.        , 0.        , 0.04166667,
       0.        , 0.04166667, 0.        , 0.        , 0.04166667,
       0.        , 0.04166667, 0.        , 0.04166667, 0.125     ,
       0.        , 0.04166667, 0.        , 0.04166667, 0.04166667,
       0.04166667, 0.04166667])

In [13]:
p1V

array([0.        , 0.05263158, 0.        , 0.        , 0.05263158,
       0.        , 0.05263158, 0.05263158, 0.        , 0.        ,
       0.        , 0.05263158, 0.05263158, 0.05263158, 0.10526316,
       0.05263158, 0.        , 0.15789474, 0.05263158, 0.        ,
       0.10526316, 0.        , 0.05263158, 0.        , 0.        ,
       0.05263158, 0.        , 0.05263158, 0.        , 0.        ,
       0.        , 0.        ])

In [14]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate smoothed, log-space naive Bayes parameters.

    Args:
        trainMatrix: sequence of per-document word vectors of equal length.
        trainCategory: sequence of labels, one per document; a label equal
            to 1 marks the abusive class, anything else the other class.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the natural log of each class's
        smoothed per-word ratios, plus
        ``sum(trainCategory) / number of documents``.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)
    # Start word counts at 1 and class totals at 2.0 so that no word ends up
    # with probability zero.
    abusiveCounts, normalCounts = np.ones(vocabSize), np.ones(vocabSize)
    abusiveTotal = normalTotal = 2.0
    for idx in range(docCount):
        wordVec = trainMatrix[idx]
        if trainCategory[idx] != 1:
            normalCounts += wordVec
            normalTotal += sum(wordVec)
            continue
        abusiveCounts += wordVec
        abusiveTotal += sum(wordVec)
    # Logs turn the later product of many small probabilities into a sum,
    # which avoids floating-point underflow.
    p1Vect = np.log(abusiveCounts / abusiveTotal)
    p0Vect = np.log(normalCounts / normalTotal)
    return p0Vect, p1Vect, pAbusive

## 4-3 朴素贝叶斯分类函数

In [15]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for a word vector using log-space scores.

    Args:
        vec2Classify: numpy word vector of the document to classify.
        p0Vec: per-word log-probabilities for class 0.
        p1Vec: per-word log-probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Log-probabilities are added instead of multiplying raw probabilities.
    scoreClass1 = sum(vec2Classify * p1Vec) + np.log(pClass1)  # element-wise product
    scoreClass0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0

In [16]:
def testingNB():
    """Train on the toy corpus and print the predicted class of two sample posts."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    trainMat = [setOfWords2Vec(vocab, post) for post in posts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(labels))
    # Classify each sample post with the freshly trained parameters.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(vocab, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

In [17]:
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


个人测试代码

In [18]:
# Re-train with the current (smoothed, log-space) trainNB0 before classifying:
# the global p0V/p1V/pAb were computed in cell In [10] by the earlier
# unsmoothed version, whereas classifyNB adds its inputs as log-probabilities.
p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
test_entry=['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']
thisDoc = np.array(setOfWords2Vec(myVocabList, test_entry))
print(test_entry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'] classified as:  0


## 4-4 朴素贝叶斯词袋模型

In [19]:
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Args:
        vocabList: list of known tokens; position i of the result refers to
            ``vocabList[i]``. Tokens must be hashable.
        inputSet: iterable of tokens making up the document.

    Returns:
        A list of ``len(vocabList)`` ints giving how many times each
        vocabulary token occurs in the document. Tokens that are not in the
        vocabulary are ignored.
    """
    # Build the token -> position map once instead of scanning the vocabulary
    # twice (``in`` + ``.index``) per word. setdefault keeps the first
    # position of a repeated token, which is what ``list.index`` returns.
    position = {}
    for idx, token in enumerate(vocabList):
        position.setdefault(token, idx)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is not None:
            returnVec[idx] += 1
    return returnVec

# 4.6 使用朴素贝叶斯过滤垃圾邮件

In [20]:
mySent='This book is the best book on Python or M.L. I have ever laid eyes upon.'

In [21]:
mySent.split()

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

In [22]:
import re

In [23]:
# Split on runs of one or more non-word characters (anything other than
# letters, digits and underscore). The pattern must not be able to match the
# empty string: on Python 3.7+ re.split() also splits on empty matches, so
# '\W*' would cut the text between every single character.
regEx=re.compile(r'\W+')

In [24]:
listOfTokens=regEx.split(mySent)
listOfTokens

  """Entry point for launching an IPython kernel.


['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon',
 '']

In [25]:
[tok.lower() for tok in listOfTokens if len(tok)>0]

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'm',
 'l',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [26]:
# Read through a context manager so the file handle is closed right after use.
with open('email/ham/6.txt') as emailFile:
    emailText = emailFile.read()

In [27]:
listOfTokens=regEx.split(emailText)

  """Entry point for launching an IPython kernel.


In [28]:
#listOfTokens #查看结果

## 4-5 文件解析及完整垃圾邮件测试函数

In [76]:
def textParse(bigString):
    """Split a string into lower-cased word tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of lower-cased tokens, split on runs of non-word characters,
        keeping only tokens with more than two characters.
    """
    import re
    words = []
    for piece in re.split(r'\W+', bigString):
        if len(piece) > 2:  # drops short fragments and empty strings
            words.append(piece.lower())
    return words

In [34]:
for _ in range(10):
    spamTest()

the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0


# 4.7 使用朴素贝叶斯分类器从个人广告中获取区域倾向

In [35]:
import feedparser

In [71]:
ny=feedparser.parse('http://www.hup.harvard.edu/hup_rss.php?new=n')

In [72]:
len(ny['entries'])

20

In [66]:
ny['entries'][0]

{'title': 'Tomorrow, the World: The Birth of U.S. Global Supremacy',
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'https://www.hup.harvard.edu/hup_rss.php?new=n',
  'value': 'Tomorrow, the World: The Birth of U.S. Global Supremacy'},
 'links': [{'rel': 'alternate',
   'type': 'text/html',
   'href': 'https://www.hup.harvard.edu/catalog.php?isbn=9780674271135'}],
 'link': 'https://www.hup.harvard.edu/catalog.php?isbn=9780674271135',
 'summary': 'Wertheim, Stephen<br />PAPERBACK<br />May 2022<br /><br /><img src="https://www.hup.harvard.edu/images/jackets/9780674271135.jpg" /><br /><p>How did the United States appoint itself as the world&rsquo;s supreme military power? <b>Stephen Wertheim</b> delves into the archives of the U.S. foreign policy elite to trace armed dominance to its origin in World War II. He shows how officials and intellectuals suddenly chose to embrace perpetual dominance&mdash;at the price of perpetual war.</p>',
 'summary_detail': {'type': 'te

In [77]:
def calcMostFreq(vocabList, fullText):
    """Return the (up to) 30 vocabulary tokens that occur most often.

    Args:
        vocabList: iterable of hashable tokens to rank.
        fullText: list of hashable tokens (the concatenated documents) in
            which occurrences are counted.

    Returns:
        A list of at most 30 ``(token, count)`` tuples sorted by descending
        count; tokens with equal counts keep their ``vocabList`` order.
    """
    import operator
    from collections import Counter
    # Count every token in one pass instead of calling fullText.count() once
    # per vocabulary token.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

def textParse(bigString):
    """Segment a string with jieba and keep lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of the lower-cased segments produced by ``jieba.cut`` (with
        ``cut_all=False``) whose length exceeds two characters.
    """
    import jieba
    kept = []
    for segment in jieba.cut(bigString, cut_all = False):
        if len(segment) > 2:
            kept.append(segment.lower())
    return kept

def localWords(feed1, feed0):
    """Train and evaluate a naive Bayes classifier on two parsed RSS feeds.

    The first ``min(len(entries))`` entries of each feed are tokenised from
    their ``'summary'`` field, the most frequent tokens are dropped from the
    vocabulary, a random hold-out set is classified, and the error rate on
    that hold-out set is printed.

    Args:
        feed1: parsed feed whose entries are labelled class 1; must support
            ``feed1['entries'][i]['summary']``.
        feed0: parsed feed whose entries are labelled class 0; same shape.

    Returns:
        ``(vocabList, p0V, p1V)``: the reduced vocabulary and the per-word
        vectors returned by ``trainNB0`` for class 0 and class 1.

    Raises:
        ValueError: if either feed has no entries.
    """
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    if minLen == 0:
        raise ValueError('both feeds must contain at least one entry')
    for i in range(minLen):
        # Alternate class 1 / class 0 so both classes get the same number of docs.
        for label, feed in ((1, feed1), (0, feed0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)  # create vocabulary
    # Remove the most frequent tokens (at most 30) from the vocabulary.
    frequentWords = {word for word, _ in calcMostFreq(vocabList, fullText)}
    vocabList = [word for word in vocabList if word not in frequentWords]
    # Hold-out split. trainingSet must be a real list that is mutated in
    # place, otherwise the test documents stay in the training data and can
    # be drawn more than once.
    trainingSet = list(range(2 * minLen)); testSet = []
    # Up to 20 test documents, but never more than half the corpus so that
    # some documents are always left for training.
    numTest = min(20, len(trainingSet) // 2)
    for _ in range(numTest):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V

In [78]:
pengpai=feedparser.parse('https://feedx.net/rss/thepaper.xml') #澎湃新闻
jihe=feedparser.parse('http://www.gcores.com/rss') #机核网
vocabList, p0V, p1V=localWords(pengpai,jihe)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\19035\AppData\Local\Temp\jieba.cache
Loading model cost 0.917 seconds.
Prefix dict has been built successfully.


the error rate is:  0.0


In [79]:
def getTopWords(ny, sf):
    """Train on two feeds and print each class's highest-scoring words.

    Args:
        ny: parsed feed passed to ``localWords`` as class 1.
        sf: parsed feed passed to ``localWords`` as class 0.

    Prints, under an ``SF`` banner and then an ``NY`` banner, every
    vocabulary word whose per-class value exceeds -6.0, ordered from the
    highest value to the lowest.
    """
    import operator
    vocabList, p0V, p1V = localWords(ny, sf)
    byScore = operator.itemgetter(1)
    # Keep only the words whose value clears the -6.0 threshold for each class.
    topSF = [(word, score) for word, score in zip(vocabList, p0V) if score > -6.0]
    topNY = [(word, score) for word, score in zip(vocabList, p1V) if score > -6.0]
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for word, _ in sorted(topSF, key=byScore, reverse=True):
        print(word)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for word, _ in sorted(topNY, key=byScore, reverse=True):
        print(word)

In [80]:
getTopWords(pengpai,jihe)

the error rate is:  0.0
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
png
figcaption
克劳德
斯特尔
italic
安格隆
犬之力
switch
fill
公元前
626
292
任天堂
全高约
古代人
boss
blockquote
2022
魔法师
伊甸园
t239
steam
奥斯卡
库尔干
NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**
2022
thepaper
记录器
黄浦区
嘉定区
app
居住地
松江区
发布会
实验室
青浦区
imagecloud
feedx
一公局
186
杨浦区
流行病学
影像学
指挥部
普陀区
width
100
600
mu5735
