In [33]:
import numpy as np
import pandas as pd
import jieba
from functools import reduce

## 加载数据集

In [2]:
text = ["你是一个愚蠢的笨蛋",
        "我的狗有跳蚤，请帮助我",
        "它也许不会去狗狗公园，笨蛋",
        "我的玩偶也太可爱了，我爱它",
        "请停止粘贴这些愚蠢且无价值的垃圾",
        "那位先生正在吃我的牛排，如何去阻止他",
        "请停止购买无价值的狗粮，笨蛋"]

In [3]:
def loadDataSet(text):
    dataset = []
    for i in text:
        temp = list(jieba.cut(i, cut_all=False))
        for j in temp:
            if j == '，':
                temp.remove(j)
        dataset.append(temp)
    classvec = [1, 0, 1, 0, 1, 0, 1]
    return dataset, classvec

In [4]:
dataset, classvec = loadDataSet(text)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/ck/5sxp2x1n4d534l6w14v0mvyr0000gn/T/jieba.cache
Loading model cost 0.872 seconds.
Prefix dict has been built successfully.


In [5]:
dataset

[['你', '是', '一个', '愚蠢', '的', '笨蛋'],
 ['我', '的', '狗', '有', '跳蚤', '请', '帮助', '我'],
 ['它', '也许', '不会', '去', '狗狗', '公园', '笨蛋'],
 ['我', '的', '玩偶', '也', '太', '可爱', '了', '我', '爱', '它'],
 ['请', '停止', '粘贴', '这些', '愚蠢', '且', '无', '价值', '的', '垃圾'],
 ['那位', '先生', '正在', '吃', '我', '的', '牛排', '如何', '去', '阻止', '他'],
 ['请', '停止', '购买', '无', '价值', '的', '狗', '粮', '笨蛋']]

In [6]:
classvec

[1, 0, 1, 0, 1, 0, 1]

## 创建词列表

In [7]:
def createVocabList(dataset):
    vocabset = set()
    for doc in dataset:
        vocabset |= set(doc)
    vocablist = list(vocabset)
    return vocablist

In [8]:
vocablist = createVocabList(dataset)
print(vocablist)

['的', '也', '我', '价值', '可爱', '粘贴', '购买', '垃圾', '狗狗', '请', '如何', '先生', '狗', '跳蚤', '他', '牛排', '一个', '不会', '玩偶', '也许', '去', '吃', '帮助', '无', '公园', '这些', '那位', '正在', '了', '是', '愚蠢', '你', '阻止', '它', '爱', '有', '笨蛋', '停止', '太', '且', '粮']


## 创建词向量（one-hot）

In [9]:
def word2vec(vocablist, inputset):
    returnvec = [0] * len(vocablist)
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)] = 1
        else:
            print(f"{word}不在词汇表中")
    return returnvec

In [10]:
returnvec = word2vec(vocablist, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'])
print(returnvec)

my不在词汇表中
dog不在词汇表中
has不在词汇表中
flea不在词汇表中
problems不在词汇表中
help不在词汇表中
please不在词汇表中
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## 获取所有词向量

In [11]:
def getTrainMat(dataset):
    trainmat = []
    vocablist = createVocabList(dataset)
    for inputset in dataset:
        returnvec = word2vec(vocablist, inputset)
        trainmat.append(returnvec)
    return trainmat

In [12]:
trainmat = getTrainMat(dataset)
print(trainmat)

[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0], [1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1]]


In [13]:
len(trainmat)

7

In [14]:
len(trainmat[0])

41

## 训练朴素贝叶斯
pAb是侮辱性词汇所占的比重，为1/2

In [15]:
def trainNB(trainmat, classvec):
    # 训练集长度
    n = len(trainmat)
    # 词向量长度
    m = len(trainmat[0])
    # 正例所占比重
    pAb = sum(classvec) / n
    # 出现在正例和反例中的所有词汇的向量初始化
    p0Num = np.ones(m)
    p1Num = np.ones(m)
    # 正例和反例分别除以出现词汇的总数的初始化
    p0Denom = (n * pAb)
    p1Denom = 1 - (n * pAb)
    for i in range(n):
        if classvec[i] == 1:
            # 将所有属于正例的词向量按位数相加，即可得到出现在正例中的所有词汇以及该词汇所出现的次数
            p1Num += trainmat[i]
            # 将所有属于正例的词向量内的元素相加，即可得到一个出现在所有正例词汇的总数
            p1Denom += sum(trainmat[i])
        else:
            # 反例同上
            p0Num += trainmat[i]
            p0Denom += sum(trainmat[i])
    # 用正例累加的所有词向量除以正例所有词向量所出现的词汇的总和既是每次词汇所属于正例的概率
    p1v = np.log(p1Num / p1Denom)
    p0v = np.log(p0Num / p0Denom)
    return p0v, p1v, pAb

In [16]:
p0v, p1v, pAb = trainNB(trainmat, classvec)

In [17]:
print(vocablist)

['的', '也', '我', '价值', '可爱', '粘贴', '购买', '垃圾', '狗狗', '请', '如何', '先生', '狗', '跳蚤', '他', '牛排', '一个', '不会', '玩偶', '也许', '去', '吃', '帮助', '无', '公园', '这些', '那位', '正在', '了', '是', '愚蠢', '你', '阻止', '它', '爱', '有', '笨蛋', '停止', '太', '且', '粮']


In [18]:
p0v

array([-2.04769284, -2.74084002, -2.04769284, -3.4339872 , -2.74084002,
       -3.4339872 , -3.4339872 , -3.4339872 , -3.4339872 , -2.74084002,
       -2.74084002, -2.74084002, -2.74084002, -2.74084002, -2.74084002,
       -2.74084002, -3.4339872 , -3.4339872 , -2.74084002, -3.4339872 ,
       -2.74084002, -2.74084002, -2.74084002, -3.4339872 , -3.4339872 ,
       -3.4339872 , -2.74084002, -2.74084002, -2.74084002, -3.4339872 ,
       -3.4339872 , -3.4339872 , -2.74084002, -2.74084002, -2.74084002,
       -2.74084002, -3.4339872 , -3.4339872 , -2.74084002, -3.4339872 ,
       -3.4339872 ])

In [19]:
p1v

array([-1.98100147, -3.36729583, -3.36729583, -2.26868354, -3.36729583,
       -2.67414865, -2.67414865, -2.67414865, -2.67414865, -2.26868354,
       -3.36729583, -3.36729583, -2.67414865, -3.36729583, -3.36729583,
       -3.36729583, -2.67414865, -2.67414865, -3.36729583, -2.67414865,
       -2.67414865, -3.36729583, -3.36729583, -2.26868354, -2.67414865,
       -2.67414865, -3.36729583, -3.36729583, -3.36729583, -2.67414865,
       -2.26868354, -2.67414865, -3.36729583, -2.67414865, -3.36729583,
       -3.36729583, -1.98100147, -2.26868354, -3.36729583, -2.67414865,
       -2.67414865])

## 套用朴素贝叶斯公式
这里的**vec2Classify * p1v**意思就是：在是侮辱性语言的条件概率下，**vec2Classify**的概率。而整个reduce的意思就是朴素贝叶斯里的条件独立性的体现，比如$P\left(数学好\ 英语不好\ 代码弱 \mid 是\right)=P\left(数学好\mid 是\right)P\left(英语不好\mid 是\right)P\left(代码弱\mid 是\right)$

In [20]:
def classifyNB(vec2Classify, p0v, p1v, pAb):
    p1 = reduce(lambda x, y: x * y, vec2Classify * p1v) * pAb
    p0 = reduce(lambda x, y: x * y, vec2Classify * p0v) * (1 - pAb)
    return 1 if p1 > p0 else 0

In [21]:
def classifyNB_1(vec2Classify, p0v, p1v, pAb):
    p1 = sum(vec2Classify * p1v) + np.log(pAb)
    p0 = sum(vec2Classify * p0v) + np.log(1 - pAb)
    return 1 if p1 > p0 else 0

In [22]:
def testingNB(testvec):
    dataset, classvec = loadDataSet(text)
    vocablist = createVocabList(dataset)
    trainmat = getTrainMat(dataset)
    p0v, p1v, pAb = trainNB(trainmat, classvec)
    test = word2vec(vocablist, testvec)
    if classifyNB_1(test, p0v, p1v, pAb) == 1:
        print(testvec, "属于侮辱性句子")
    else:
        print(testvec, "属于非侮辱性句子")

In [31]:
testvec1 = ['我', '爱', '玩偶']
testingNB(testvec1)

['我', '爱', '玩偶'] 属于非侮辱性句子


In [32]:
testvec2 = ['我', '爱', '狗']
testingNB(testvec2)

['我', '爱', '狗'] 属于非侮辱性句子
