### 4.5.1 准备数据：从文本中构建词向量
#### 程序清单4-1 词表到向量的转换函数

In [1]:
def loadDataSet():
    """Return the toy training corpus used throughout this section.

    Returns:
        A tuple (postingList, classVec): postingList is a list of six posts,
        each already split on whitespace into a list of word strings, and
        classVec is the list of integer labels aligned with those posts
        (the training code in this file treats label 1 as the abusive class).
    """
    sentences = (
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute i love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    )
    labels = [0, 1, 0, 1, 0, 1]
    # Tokenize lazily-defined sentences into one word list per post.
    tokenized = [sentence.split() for sentence in sentences]
    return tokenized, labels


def createVocabList(dataSet):
    """Collect every distinct word that appears in any document of dataSet.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing each distinct word exactly once. The list comes
        from a set, so its ordering is arbitrary.
    """
    # A single n-ary union over all documents replaces the per-document merge.
    distinctWords = set().union(*dataSet)
    return list(distinctWords)


def setOfWords2Vec(vocabList,inputSet):
    """Encode a single document as a 0/1 presence vector over vocabList.

    Args:
        vocabList: Sequence of vocabulary words; position i of the result
            corresponds to vocabList[i].
        inputSet: Container holding the words of one document. Only
            membership is tested, so a word repeated in the document still
            yields 1, and words missing from vocabList are ignored.

    Returns:
        A list of ints with the same length as vocabList in which entry i is
        1 when vocabList[i] occurs in inputSet and 0 otherwise.
    """
    # Each position is decided directly from its own word. This avoids a
    # linear vocabList.index() scan per word (quadratic in vocabulary size)
    # and keeps duplicate vocabulary entries consistent with each other.
    return [1 if word in inputSet else 0 for word in vocabList]

In [2]:
# Quick check of the functions defined above: load the sample posts and
# their labels, then display the tokenized posts.
listOPosts,listClasses=loadDataSet()
listOPosts

[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
 ['my', 'dalmation', 'is', 'so', 'cute', 'i', 'love', 'him'],
 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]

In [3]:
# Build the vocabulary from all sample posts and show how many distinct
# words it contains.
myVocabList=createVocabList(listOPosts)
len(myVocabList)

32

In [4]:
# Encode the first post as a 0/1 vector with one entry per vocabulary word.
setOfWords2Vec(myVocabList,listOPosts[0])

[1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0]

### 4.5.2 训练算法：从词向量计算概率
#### 计算每个类别的条件概率
#### 程序清单4-2 朴素贝叶斯分类器训练函数

In [5]:
import numpy as np


def trainNB0(trainMatrix, trainCategory):
    """Estimate per-class word probabilities with no smoothing applied.

    Args:
        trainMatrix: Sequence of equal-length numeric word vectors, one per
            training document.
        trainCategory: Sequence of labels aligned with trainMatrix. A label
            equal to 1 selects the abusive class; any other value is
            counted as class 0.

    Returns:
        A tuple (p0Vect, p1Vect, pAbusive). p0Vect[i] is p(w_i | c0) and
        p1Vect[i] is p(w_i | c1), each computed as the class's accumulated
        count for word i divided by the class's total word count. pAbusive
        is sum(trainCategory) divided by the number of documents.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Slot 0 accumulates class 0, slot 1 accumulates the abusive class.
    wordCounts = [np.zeros(vocabSize), np.zeros(vocabSize)]
    wordTotals = [0.0, 0.0]
    for row, wordVector in enumerate(trainMatrix):
        label = 1 if trainCategory[row] == 1 else 0
        wordCounts[label] += wordVector
        wordTotals[label] += sum(wordVector)

    # Without smoothing, a word never seen in a class gets probability 0.
    p1Vect = wordCounts[1] / wordTotals[1]
    p0Vect = wordCounts[0] / wordTotals[0]

    return p0Vect, p1Vect, pAbusive
# 注意这里返回的p0Vect的含义是：
# p(wi|c0) for i in range(numWords)    # numWords:特征个数

# 注意这里返回的p1Vect的含义是：
# p(wi|c1) for i in range(numWords)    # numWords:特征个数


#### 有两个问题需要解决：
- 只要有一个特征对应的概率为零，累乘后最终的结果就为0。解决方法：分子分母都加1
- 若干个较小的数相乘，会导致最后的结果过小，造成浮点数下溢。解决方法：取对数
#### 则上述代码改为：

In [6]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate per-class log word probabilities with add-one initial counts.

    Args:
        trainMatrix: Sequence of equal-length numeric word vectors, one per
            training document.
        trainCategory: Sequence of labels aligned with trainMatrix. A label
            equal to 1 selects the abusive class; any other value is
            counted as class 0.

    Returns:
        A tuple (p0Vect, p1Vect, pAbusive). p0Vect and p1Vect hold the
        natural log of each word's estimate for class 0 and class 1
        respectively; pAbusive is sum(trainCategory) divided by the number
        of documents (it is returned as a plain ratio, not a log).
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Every word count starts at 1 and every class total at 1.0, so no
    # ratio is exactly zero and the log below never sees 0. Because each
    # word adds 1 to the numerators while the total only starts at 1.0,
    # the per-class ratios do not sum to 1.
    wordCounts = {0: np.ones(vocabSize), 1: np.ones(vocabSize)}
    wordTotals = {0: 1.0, 1: 1.0}
    for row, wordVector in enumerate(trainMatrix):
        label = 1 if trainCategory[row] == 1 else 0
        wordCounts[label] += wordVector
        wordTotals[label] += sum(wordVector)

    # Logs turn the later product of many small factors into a sum, which
    # avoids floating-point underflow.
    p1Vect = np.log(wordCounts[1] / wordTotals[1])
    p0Vect = np.log(wordCounts[0] / wordTotals[0])

    return p0Vect, p1Vect, pAbusive


### 4.5.3 测试算法：
#### 程序清单4-3 朴素贝叶斯分类函数

In [7]:
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    """Pick the class with the larger log-space naive Bayes score.

    Args:
        vec2Classify: Word vector of the sample to classify.
        p0Vec: Per-word log probabilities for class 0, i.e. log p(w|c0).
        p1Vec: Per-word log probabilities for class 1, i.e. log p(w|c1).
        pClass1: Prior probability of class 1, p(c1), as a plain
            probability; its log is taken here.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # In log space the product of likelihoods and prior becomes a sum: the
    # word vector weights the per-word log probabilities, then the log
    # prior is added.
    scoreClass1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    scoreClass0 = np.sum(vec2Classify * p0Vec) + np.log(1 - pClass1)
    return int(scoreClass1 > scoreClass0)
    

def testingNB():
    """Train on the sample posts and print the predicted class of two test entries.

    Loads the sample corpus, builds its vocabulary, encodes every post as a
    0/1 word vector, trains with trainNB0, and then prints one line per
    test entry showing the label returned by classifyNB.
    """
    posts, labels = loadDataSet()
    vocabulary = createVocabList(posts)

    # One presence vector per training post.
    trainMat = [setOfWords2Vec(vocabulary, post) for post in posts]
    p0V, p1V, pAb = trainNB0(trainMat, np.array(labels))

    # NOTE(review): 'parbage' does not occur in the sample posts returned by
    # loadDataSet (possibly a typo for 'garbage' - confirm), so it is not in
    # the vocabulary and does not contribute to the encoded vector.
    for testEntry in (['love','my','dalmation'], ['stupid','parbage']):
        thisDoc = np.array(setOfWords2Vec(vocabulary, testEntry))
        print(testEntry,'is classified as : ',classifyNB(thisDoc,p0V,p1V,pAb))
    
    
# Run the end-to-end demo; it prints one classification line per test entry.
testingNB()


['love', 'my', 'dalmation'] is classified as :  0
['stupid', 'parbage'] is classified as :  1


### 4.5.4 准备数据：文档词袋模型
函数bagOfWordsVecMN()是setOfWords2Vec()的改进版：对于重复出现的词，会累计其出现次数，而不仅仅是one-hot的0和1
#### 程序清单4-4 朴素贝叶斯词袋模型

In [8]:
def bagOfWordsVecMN(vocabList,inputSet):
    """Encode one document as a vector of word occurrence counts.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to vocabList[i].
        inputSet: Iterable of the words in a single document; repeated
            words are counted once per occurrence.

    Returns:
        A list of ints with the same length as vocabList, where each entry
        is the number of times that vocabulary word occurs in inputSet.
        Words missing from vocabList are skipped.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        try:
            position = vocabList.index(token)
        except ValueError:
            # Out-of-vocabulary word: nothing to count.
            continue
        counts[position] += 1

    return counts

SyntaxError: invalid syntax (Temp/ipykernel_32728/3696656995.py, line 1)