In [37]:
# -*- coding=utf-8 -*-
# @Time :2021/7/17 20:33
# @Author :Lu runyu
# @File : Naive_Bayes.ipynb
# @Software : Jupyter Notebook

"""
    python3实现朴素贝叶斯分类器
    以过滤spam为例, 实现二分类器
"""

import numpy as np


class NaiveBayes:

    def __init__(self):
        self.likelihood_1 = None
        self.likelihood_0 = None
        self.p_c_0 = None
        self.p_c_1 = None
        self.tag_num = None

    def fit(self, dataset, labels):
        """
        :param dataset: dataset is an one-hot-encoding numpy array
        :param labels: corresponding tags
        :return: None
        """
        
        """
        由于这里面的要计算的条件概率和先验概率实在是很少，我们就不打算引入字典类型，我们直接每一类都用列表描述就行了。
        这里相当于：label:[y1, y2, y3, ……, yn](实际上只有0，1)
        而我们的dataset中的每一条也即：Xi:[0, 1, 0, 1, 1, ……, 1, 0], 其类别为yi\
        我们要算出P(xi|yk),也要算出条件概率P(yk)
        """

        # 首先是先验概率：
        self.likelihood_0 = np.log(labels.tolist().count(0) / len(labels))
        self.likelihood_1 = 0 - self.likelihood_0
        
        # 接下来是我们的条件概率：
        index_1 = np.where(labels==1)  # 得到labels=1的索引
        self.p_c_1 = np.sum(dataset[index_1], axis=0) / dataset[index_1].shape[0]   # 得到P(xi=1|yk=1)概率的矩阵
        p_c_1 = (np.zeros(dataset.shape[1]) + 1) - self.p_c_1     # 得到P(xi=0|yk=1)概率的矩阵
        self.p_c_1 = np.log(np.concatenate((p_c_1.reshape(1, -1), self.p_c_1.reshape(1, -1)), axis=0))
        index_0 = np.where(labels==0) # 得到labels=0的索引
        self.p_c_0 = np.sum(dataset[index_0], axis=0) / dataset[index_0].shape[0]   # 得到P(xi=1|yk=0)概率的矩阵
        p_c_0 = (np.zeros(dataset.shape[1]) + 1) - self.p_c_0     # 得到P(xi=0|yk=0)概率的矩阵
#         self.p_c_0 = np.log(np.array([].append(p_c_0).append(self.p_c_0)))
        self.p_c_0 = np.log(np.concatenate((p_c_0.reshape(1, -1), self.p_c_0.reshape(1, -1)), axis=0))
        

    def predict(self, testset):
        """

        :param testset: the dataset to be predicted(still one-hot-encoding)
        :return: an array of labels
        """

        """
        预测相对而言就比较简单了
        """
        return np.array([np.argmax(np.array([self.likelihood_0 + sum([self.p_c_0[document[i]][i] for i in range(self.p_c_0.shape[1])]), self.likelihood_1 + sum([self.p_c_1[document[i]][i] for i in range(self.p_c_1.shape[1])])])) for document in testset])
        
        
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec


def createVocabList(dataSet):
    vocabSet = set([])  # create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets取并集
    return list(vocabSet)   # 返回的是一个一维的，不含重复单词的列表，列表中包含所有dataSet中含有的单词。相当于一个字典（不是python里的那种字典）。


def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1

    return returnVec    # 相当于一个one-hot编码


if __name__ == '__main__':

    listOPosts, listClasses = loadDataSet()
    VocabList = createVocabList(listOPosts)
    train_dataset = []
    for sentence in listOPosts:
        train_dataset.append(setOfWords2Vec(VocabList, sentence))
    train_dataset = np.array(train_dataset)
    labelset = np.array(listClasses)
    print(labelset)
    nb_clf = NaiveBayes()
    nb_clf.fit(train_dataset, labelset)
    print(nb_clf.likelihood_0)
    testset = []
    test1 = setOfWords2Vec(VocabList, ['love', 'my', 'dalmation'])
    test2 = setOfWords2Vec(VocabList, ['stupid', 'garbage'])
    testset.append(test1)
    testset.append(test2)
    testset = np.array(testset)
    result = nb_clf.predict(testset)
    print(result)



[0 1 0 1 0 1]
-0.6931471805599453
[0 1]


  self.p_c_1 = np.log(np.concatenate((p_c_1.reshape(1, -1), self.p_c_1.reshape(1, -1)), axis=0))
  self.p_c_0 = np.log(np.concatenate((p_c_0.reshape(1, -1), self.p_c_0.reshape(1, -1)), axis=0))


In [12]:
a = np.random.randint(1, 3, 10)
a

array([2, 2, 2, 1, 2, 1, 2, 1, 1, 2])

In [13]:
np.where(a==1)

(array([3, 5, 7, 8], dtype=int64),)

In [14]:
np.zeros(3)

array([0., 0., 0.])

In [21]:
b = list().append([1, 2, 3])
print(b)

None


In [35]:
a = np.array(list([1, 2, 3]))
b = np.array(([2, 3, 4]))

a = a.reshape(1, -1)
b = b.reshape(1, -1)
np.concatenate((np.array(a), np.array(b)), axis=0)

array([[1, 2, 3],
       [2, 3, 4]])