本实验使用朴素贝叶斯分类器，从个人广告中获取区域倾向。

In [1]:
import feedparser

# Fetch the Craigslist `stp` RSS feeds for the two regions being compared.
# The host name is spelled "craigslist" (the original URLs dropped the "s").
ny = feedparser.parse('http://newyork.craigslist.org/search/stp?format=rss')
sf = feedparser.parse('http://sfbay.craigslist.org/search/stp?format=rss')


In [2]:
# Number of entries in the parsed New York feed.
len(ny['entries'])

25

In [4]:
# Number of entries in the parsed San Francisco Bay Area feed.
len(sf['entries'])

25

In [7]:
# Inspect the summary text of the first entry in the `sf` feed.
sf['entries'][0]['summary']

u"Hello Craigslist world, \nThanks for looking at my post. Why would anybody click on a headline like that is beyond me... Let's face it, we all have issues one way or another. It's not a question wether we are crazy or not, but how crazy are we. \nAnywa [...]"

获取完数据之后，开始处理数据。我们需要的是广告内容的摘要，需要将这些文本处理成词条向量。

In [8]:
import re

def loadFromRSS(feed):
    """Tokenize the summary text of a single RSS entry.

    Args:
        feed: A mapping for one feed entry; its ``'summary'`` value is the
            text that gets tokenized.

    Returns:
        A list of lower-cased tokens. Tokens of two characters or fewer are
        discarded.
    """
    data = feed['summary']

    # Split on runs of ONE OR MORE non-word characters.  The pattern must not
    # be able to match the empty string: since Python 3.7 ``re.split`` also
    # splits on empty matches, so ``\W*`` would cut the text into single
    # characters and the length filter below would then discard everything.
    regEx = re.compile(r'\W+')
    wordList = regEx.split(data)
    return [tok.lower() for tok in wordList if len(tok) > 2]

In [9]:
# The entry's summary can now be turned into a list of tokens.
loadFromRSS(sf['entries'][0])

[u'hello',
 u'craigslist',
 u'world',
 u'thanks',
 u'for',
 u'looking',
 u'post',
 u'why',
 u'would',
 u'anybody',
 u'click',
 u'headline',
 u'like',
 u'that',
 u'beyond',
 u'let',
 u'face',
 u'all',
 u'have',
 u'issues',
 u'one',
 u'way',
 u'another',
 u'not',
 u'question',
 u'wether',
 u'are',
 u'crazy',
 u'not',
 u'but',
 u'how',
 u'crazy',
 u'are',
 u'anywa']

In [10]:
def createVocabList(docList):
    """Build the vocabulary: every distinct word that occurs in any document.

    Args:
        docList: An iterable of documents, each an iterable of words.

    Returns:
        A list holding each distinct word once, in no particular order.
    """
    # Union all documents into one set in a single call, then hand the
    # result back as a list.
    uniqueWords = set().union(*docList)
    return list(uniqueWords)

In [12]:
from numpy import *

def bagOfWords2VecMN(vocabList,inputSet):
    """Convert a document into a bag-of-words count vector.

    Args:
        vocabList: The vocabulary, as a list of (hashable) words.
        inputSet: The words of one document; repeated words are counted
            each time they occur.

    Returns:
        A list with one integer per vocabulary entry giving how many times
        that word occurs in ``inputSet``. Words that are not in the
        vocabulary are ignored.
    """
    # Build the word -> position lookup once instead of scanning the
    # vocabulary list twice for every token.  ``setdefault`` keeps the first
    # position of a repeated vocabulary word, matching ``list.index``.
    wordIndex = {}
    for position, word in enumerate(vocabList):
        wordIndex.setdefault(word, position)

    # One counter per vocabulary word, all starting at zero.
    returnVec = [0]*len(vocabList)

    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

训练算法

In [14]:
def trainNB0(trainMatrix,trainCategory):
    """Train the naive Bayes model from word-count vectors and 0/1 labels.

    Args:
        trainMatrix: A sequence of per-document word-count vectors, all of
            the same length.
        trainCategory: The label of each document; a value equal to 1 means
            class 1, anything else is treated as class 0.

    Returns:
        A tuple ``(p0Vec, p1Vec, pAbusive)``: the per-word log probabilities
        for class 0, the same for class 1, and the fraction of documents
        labelled 1 (computed as ``sum(trainCategory) / number of documents``).
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])

    # Prior of class 1: the labels are 0/1, so their sum counts class 1.
    pAbusive = sum(trainCategory)/float(docCount)

    # Slot 0 accumulates class 0, slot 1 accumulates class 1.  Word counts
    # start at one and the totals at two, so no word ends up with a zero
    # probability (and the log below is always defined).
    wordCounts = [ones(vocabSize), ones(vocabSize)]
    wordTotals = [2.0, 2.0]

    for idx, docVec in enumerate(trainMatrix):
        label = 1 if trainCategory[idx] == 1 else 0
        wordCounts[label] += docVec
        wordTotals[label] += sum(docVec)

    # Work in log space so that products of many small probabilities can be
    # handled as sums by the classifier.
    p0Vec = log(wordCounts[0]/wordTotals[0])
    p1Vec = log(wordCounts[1]/wordTotals[1])

    return p0Vec,p1Vec,pAbusive

In [15]:
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    """Pick the more likely class for one word-count vector.

    Args:
        vec2Classify: The word-count vector of the document to classify.
        p0Vec: Per-word log probabilities for class 0.
        p1Vec: Per-word log probabilities for class 1.
        pClass1: Prior probability of class 1; class 0 gets ``1 - pClass1``.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (so ties go to class 0).
    """
    # Log-space score of each class: the count-weighted sum of the word log
    # probabilities plus the log of the class prior.
    scoreClass1 = sum(vec2Classify*p1Vec) + log(pClass1)
    scoreClass0 = sum(vec2Classify*p0Vec) + log(1.0-pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0

测试算法


测试的步骤：

1. 读取两个城市的RSS广告信息
2. 去掉出现频次最高的30个词汇
3. 从中随机提取10则广告作为测试集，其他的为训练集
4. 构造训练算法的输入矩阵和类型向量
5. 执行训练算法得到概率向量
6. 循环读取测试集中的广告，对广告执行分类操作并判断是否正确
7. 打印错误率

In [18]:
def calcMostFreq(vocabList,fullText,topN=30):
    """Return the vocabulary words that occur most often in the full text.

    Args:
        vocabList: The vocabulary words to rank.
        fullText: A flat list of every token from every document.
        topN: How many of the most frequent words to return (default 30).

    Returns:
        A list of at most ``topN`` ``(word, count)`` tuples sorted by count,
        highest first. Vocabulary words missing from ``fullText`` have a
        count of 0.
    """
    from collections import Counter
    import operator

    # Count every token in one pass rather than rescanning the whole text
    # once per vocabulary word.
    tokenCounts = Counter(fullText)

    # Frequency of each vocabulary word; a Counter yields 0 for tokens it
    # never saw.
    freqDict = {}
    for token in vocabList:
        freqDict[token] = tokenCounts[token]

    # ``items()`` exists on both Python 2 and Python 3 dicts, whereas
    # ``iteritems()`` is Python 2 only.
    sortedFreq = sorted(freqDict.items(),key=operator.itemgetter(1),reverse=True)

    return sortedFreq[:topN]

In [19]:
def localWords(feed1,feed0,testSize=10):
    """Train and evaluate the naive Bayes classifier on two RSS feeds.

    Entries of ``feed1`` are labelled 1 and entries of ``feed0`` are
    labelled 0. The most frequent words are dropped from the vocabulary,
    ``testSize`` randomly chosen documents are held out, the model is trained
    on the rest, and the error rate on the held-out documents is printed.

    Args:
        feed1: Parsed feed whose ``'entries'`` become class 1.
        feed0: Parsed feed whose ``'entries'`` become class 0.
        testSize: Number of documents held out for testing (default 10).

    Raises:
        ValueError: If ``testSize`` is not positive, or if holding out
            ``testSize`` documents would leave nothing to train on.
    """
    # Documents as token lists, their labels, and every token concatenated.
    docList=[];classList=[];fullText=[]

    # Use the same number of entries from each feed.
    minLen = min(len(feed1['entries']),len(feed0['entries']))

    # Fail early with a clear message instead of an IndexError deep inside
    # the sampling loop or the training function.
    if testSize < 1 or 2*minLen <= testSize:
        raise ValueError(
            "need more than %d documents to hold out %d for testing, got %d"
            % (testSize, testSize, 2*minLen))

    # Interleave the two feeds: entry i of feed1 (label 1), then entry i of
    # feed0 (label 0).
    for i in range(minLen):
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = loadFromRSS(feed['entries'][i])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = createVocabList(docList)

    # Drop the most frequent words from the vocabulary.
    frequentWords = set(pairW[0] for pairW in calcMostFreq(vocabList,fullText))
    vocabList = [word for word in vocabList if word not in frequentWords]

    # Randomly move ``testSize`` document indices from the training set to
    # the test set.  ``list(...)`` is required because a Python 3 ``range``
    # does not support item deletion.
    # NOTE(review): ``random`` is not imported explicitly; it presumably
    # resolves to ``numpy.random`` through ``from numpy import *`` - confirm.
    trainingSet = list(range(2*minLen));testSet=[]
    for _ in range(testSize):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Build the training matrix and label vector.
    trainMat = [];trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])

    # Train: log-probability vector per class plus the class-1 prior.
    p0V,p1V,pClass1 = trainNB0(array(trainMat),array(trainClasses))

    # Classify every held-out document and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVec = bagOfWords2VecMN(vocabList,docList[docIndex])
        if classifyNB(array(wordVec),p0V,p1V,pClass1) != classList[docIndex]:
            errorCount += 1

    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    errorRate = float(errorCount)/len(testSet)
    print("%s %s" % ("the error rate is: ", errorRate))

In [20]:
# Run the test with the `ny` feed as class 1 and the `sf` feed as class 0.
localWords(ny,sf)

TypeError: len() takes exactly one argument (2 given)