In [1]:
import numpy as np
import pandas as pd
import feedparser
from MyNaviebayes import MyNavieBayes
import re
import operator

## 词集模型，使用朴素贝叶斯模型进行学习和预测

In [2]:
# Toy corpus for the demo. Each inner list of postingList is one document that
# has already been split into words. classVec holds one label per document, in
# the same order (e.g. if each document were a review, 1 would mark a negative
# review and 0 a positive one).
# NOTE(review): 'fea' looks like a typo for 'flea'; kept unchanged because it is data.
postingList = [
    ['my', 'dog', 'has', 'fea', 'problems', 'help', 'please'],
    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
]
classVec = [0, 1, 0, 1, 0, 1]  # 1 marks a document that contains abusive words

In [3]:
# Merge all tokenised documents into a single de-duplicated vocabulary.
def createVocabList(dataset):
    """Return a list of the distinct words that occur anywhere in dataset.

    dataset: iterable of documents, each an iterable of hashable tokens.
    Returns: a list with one entry per distinct token. The order comes from
    set iteration, so it is not guaranteed.
    """
    uniqueWords = set()
    for doc in dataset:
        uniqueWords.update(doc)  # in-place union with this document's tokens
    return list(uniqueWords)

In [4]:
# Encode a tokenised document as a 0/1 vector aligned with the vocabulary:
# 1 means the vocabulary word occurs in the document, 0 means it does not.
def setOfWords2Vec(vocabList, inputSet):
    """Set-of-words encoding of one document.

    Only the presence or absence of each word is recorded; how often a word
    occurs is ignored.

    vocabList: list of vocabulary words; position i of the result refers to
        vocabList[i].
    inputSet: iterable of words making up the document.
    Returns: list of ints (0 or 1) with the same length as vocabList. This is
        the feature vector handed to the naive Bayes model.
    Side effect: prints a message for every word that is not in vocabList.
    """
    presence = [0 for _ in vocabList]
    for token in inputSet:
        if token not in vocabList:
            print('the word: %s is not in my Vocabulary!' % token)
            continue
        presence[vocabList.index(token)] = 1
    return presence

In [5]:
# Fit the naive Bayes model on the given corpus, then classify two fixed
# sample documents with the fitted parameters and print the predictions.
def testingNB(NBmodel, X, y):
    """Train NBmodel on (X, y) with set-of-words features and print two predictions.

    NBmodel: object exposing fit(features, labels), which returns three values,
        and predict(vector, p0V, p1V, pAb), which takes those three values back.
        Presumably they are the per-class word probabilities and a class
        prior; confirm against the model implementation.
    X: list of tokenised documents (lists of words).
    y: one label per document in X.
    Returns: None; results are printed.
    """
    vocab = createVocabList(X)
    features = [setOfWords2Vec(vocab, doc) for doc in X]

    # Learn the model parameters from the encoded corpus.
    p0V, p1V, pAb = NBmodel.fit(np.array(features), np.array(y))

    # Classify each hard-coded sample with the learned parameters.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(vocab, testEntry))
        print(testEntry, 'classified as: ', NBmodel.predict(thisDoc, p0V, p1V, pAb))

In [6]:
# Build a fresh classifier and run the set-of-words demo on the toy corpus.
nb = MyNavieBayes()
testingNB(nb, postingList, classVec)

(['love', 'my', 'dalmation'], 'classified as: ', 0)
(['stupid', 'garbage'], 'classified as: ', 1)


## 词袋模型，使用朴素贝叶斯模型进行学习和预测

In [7]:

def bagOfWords2VecMN(vocabList, inputSet):
    """Bag-of-words encoding of one document.

    Counts how many times each vocabulary word occurs in the document; a
    vocabulary word that never occurs keeps a count of 0. Words that are not
    in vocabList are skipped silently.

    vocabList: list of vocabulary words; position i of the result refers to
        vocabList[i].
    inputSet: iterable of words making up the document.
    Returns: list of int counts with the same length as vocabList. This is the
        feature vector handed to the naive Bayes model.
    """
    counts = [0 for _ in vocabList]
    for token in inputSet:
        if token not in vocabList:
            continue
        slot = vocabList.index(token)
        counts[slot] = counts[slot] + 1
    return counts

In [8]:
def calcMostFreq(vocabList, fullText, topN=100):
    """Return the vocabulary words that occur most often in fullText.

    vocabList: iterable of hashable vocabulary words.
    fullText: list of words (the whole corpus flattened); each vocabulary
        word is counted with fullText.count().
    topN: maximum number of (word, count) pairs to return; defaults to 100.
    Returns: list of (word, count) tuples sorted by count, highest first,
        truncated to topN entries.
    """
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    # dict.items() works on both Python 2 and Python 3; the previous
    # dict.iteritems() exists only on Python 2 and raises AttributeError on 3.
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]

In [9]:
# Minimal tokenizer for a sentence or document.
def textParse(bigString):
    """Split bigString into lower-cased tokens.

    The text is split on every non-word character; pieces of two characters
    or fewer (including the empty strings produced by adjacent separators)
    are dropped, and the remaining pieces are lower-cased.

    bigString: the text to tokenise.
    Returns: list of lower-case tokens in their original order.
    """
    tokens = []
    for piece in re.split(r'\W', bigString):
        if len(piece) > 2:
            tokens.append(piece.lower())
    return tokens

In [10]:
def localWords(NBmodel, feed1, feed0, numTest=20):
    """Train and evaluate a bag-of-words naive Bayes classifier on two feeds.

    NBmodel: object exposing fit(features, labels), which returns three values,
        and predict(vector, p0V, p1V, pSpam), which takes those three values
        back. Presumably they are the per-class word probabilities and a
        class prior; confirm against the model implementation.
    feed1: mapping whose 'entries' item is a sequence of mappings with a
        'summary' text; its documents are labelled 1.
    feed0: same structure as feed1; its documents are labelled 0.
    numTest: number of documents held out at random for testing. Defaults to
        20 and is capped at the number of available documents.
    Returns: (vocabList, p0V, p1V), where vocabList is the vocabulary after
        removing the most frequent words and p0V/p1V are the first two values
        returned by NBmodel.fit.
    Side effect: prints the error rate measured on the held-out documents.
    """
    docList = []
    classList = []
    fullText = []
    # Use the same number of entries from each feed so the classes are balanced.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = createVocabList(docList)
    # Drop the most frequent words; these are usually structural words such as
    # "the", "and", "a" that carry little class information.
    frequentWords = set(pair[0] for pair in calcMostFreq(vocabList, fullText))
    vocabList = [word for word in vocabList if word not in frequentWords]

    # Split document indices into a training set and a test set.
    # list(...) is required: a Python 3 range object does not support deletion.
    trainingSet = list(range(2 * minLen))
    testSet = []
    # Cap the hold-out size so a small corpus cannot index an empty list.
    for _ in range(min(numTest, len(trainingSet))):
        # randint draws from [0, len), so the index is always valid.
        randIndex = int(np.random.randint(len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    # Fit the model on the training documents.
    p0V, p1V, pSpam = NBmodel.fit(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if NBmodel.predict(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1

    # Avoid dividing by zero when nothing could be held out.
    errorRate = float(errorCount) / len(testSet) if testSet else 0.0
    # Single-argument print() emits the same text on Python 2 and Python 3.
    print('the error rate is:  %s' % errorRate)
    return vocabList, p0V, p1V

In [11]:
def getTopWords(self, ny, sf, threshold=-6.0):
    """Print the most indicative words for each of the two feeds.

    self: the object used to train the classifier. If it has a callable
        localWords attribute, that is called as self.localWords(ny, sf), as
        the original code did. Otherwise self is treated as the model and is
        passed to the module-level localWords(self, ny, sf).
        NOTE(review): the parameter name suggests this function was moved out
        of a class; confirm which call path is intended.
    ny: feed passed first to localWords.
    sf: feed passed second to localWords.
    threshold: only words whose score is strictly greater than this value are
        printed; defaults to -6.0.
    Returns: None. Prints an "SF" banner followed by the words selected from
        p0V, then an "NY" banner followed by the words selected from p1V,
        each group ordered by descending score.
    """
    boundLocalWords = getattr(self, 'localWords', None)
    if callable(boundLocalWords):
        vocabList, p0V, p1V = boundLocalWords(ny, sf)
    else:
        # In this file localWords is a plain function that takes the model first.
        vocabList, p0V, p1V = localWords(self, ny, sf)

    topSF = []
    topNY = []
    for i in range(len(p0V)):
        if p0V[i] > threshold:
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > threshold:
            topNY.append((vocabList[i], p1V[i]))

    groups = (
        ("SF**SF**SF**SF**SF**SF**SF**SF**SF**", topSF),
        ("NY**NY**NY**NY**NY**NY**NY**NY**NY**", topNY),
    )
    for banner, pairs in groups:
        ranked = sorted(pairs, key=lambda pair: pair[1], reverse=True)
        print(banner)
        for item in ranked:
            print(item[0])

In [12]:
# Fetch and parse the New York RSS feed named in the URL (needs network access).
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')

In [13]:
# Fetch and parse the SF Bay RSS feed named in the URL (needs network access).
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')

In [14]:
# Train and evaluate the bag-of-words classifier on the two feeds.
# localWords labels documents from its first feed (ny) as class 1 and from its
# second feed (sf) as class 0, and returns the class-0 values before the
# class-1 values, hence the pSF, pNY order.
nb = MyNavieBayes()
vocabList, pSF, pNY =localWords(nb, ny, sf)

the error rate is:  0.3
