In [68]:
import numpy as np
import math

In [8]:
def create_dataset():
    '''
    file_list中每一行表示每一个文档，并且每一行的大小不同
    '''
    file_list = [['my','dog','has','flea','problems','help','please'],
                ['maybe','not','take','him','to','dog','park','stupid'],
                ['my','dalmation','is','so','cute','I','love','him'],
                ['stop','posting','stupid','worthless','garbage'],
                ['my','licks','ate','my','steak','how','to','stop','him'],
                ['quit','buying','worthless','dog','food','stupid']]
    #类标签
    file_labels = [0, 1, 0, 1, 0, 1]
    return file_list, file_labels

In [31]:
def createVocalList(file_list):
    '''
    求文档中的词条，即出现过的词
    '''
    vocal_list = set([])
    for doc in file_list:
        vocal_list = vocal_list|set(doc)  #求两个set集合的并集
    return list(vocal_list)

In [85]:
def setword(vocal_list, inputset):
    '''
    判断文档中是否出现过该词，1指出现过，0没有出现过
    '''
    returnvec = len(vocal_list)*[0]
    for vec in inputset:
        if vec in vocal_list:
            #出现过就是1
            returnvec[vocal_list.index(vec)] = 1
    return returnvec

In [86]:
#vocal_list是词条列表
file_list, file_labels = create_dataset()
vocal_list = createVocalList(file_list)

In [87]:
len(vocal_list)

31

In [88]:
#词条向量，及文档中出现就是1，没有出现就是0,每个文档都有一个词条向量
vec = setword(vocal_list, file_list[0])
trainMatrix = []
for i in range(len(file_list)):
    vec = setword(vocal_list, file_list[i])
    trainMatrix.append(vec)
#每篇文档的vec
#类标签，及file_labels

In [89]:
def tranNB(trainMatrix, trainCategory):
    '''
    trainMatrix表示一整个list中每个文档出现的的词条
    trainCategory每一篇文档的类标签
    
    '''
    #num_doc表示文档的数目，例如6
    num_doc = len(trainMatrix)
    #num_wor表示每个词条的长度，例如31
    num_wor = len(trainMatrix[0])
    #文档中类1所占的比例p(c=1)
    pc = sum(trainCategory)/float(num_doc)
    
    p0_num = np.zeros(num_wor)
    p1_num = np.zeros(num_wor)
    p0demo = 0.0
    p1demo = 0.0
    
    #遍历每一个文档
    for i in range(num_doc):
        #判断文档类别是否为1
        if(trainCategory[i] == 1):
            #类别为1的文档,放在p1_num里面
            p1_num += trainMatrix[i]
            #计算词条总数
            p1demo += sum(trainMatrix[i])
        else:
            p0_num += trainMatrix[i]
            p0demo += sum(trainMatrix[i])
    #类别为1的条数 和 词的数目 的商
    p1vect = p1_num / p1demo   
    p0vect = p0_num / p0demo
    return p0vect, p1vect, pc

In [90]:
tranNB(trainMatrix, file_labels)

(array([0.        , 0.04347826, 0.04347826, 0.04347826, 0.        ,
        0.        , 0.13043478, 0.04347826, 0.        , 0.04347826,
        0.04347826, 0.        , 0.        , 0.04347826, 0.04347826,
        0.        , 0.04347826, 0.04347826, 0.08695652, 0.        ,
        0.        , 0.        , 0.04347826, 0.04347826, 0.04347826,
        0.04347826, 0.04347826, 0.04347826, 0.04347826, 0.        ,
        0.04347826]),
 array([0.10526316, 0.10526316, 0.        , 0.        , 0.05263158,
        0.05263158, 0.        , 0.        , 0.05263158, 0.05263158,
        0.        , 0.05263158, 0.05263158, 0.        , 0.        ,
        0.05263158, 0.        , 0.        , 0.05263158, 0.05263158,
        0.05263158, 0.15789474, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.05263158, 0.05263158,
        0.        ]),
 0.5)

In [91]:
def classifyNB(vecClassify, p0vect, p1vect, pc):
    '''
    vecClassify:待分类的词条
    p0vect:类别0的文档中词条出现的频数p(w0|c0)
    p1vect:类别0的文档中词条出现的频数p(w1|c1)
    pc:类别为1的文档的比例
    '''
    p1 = sum(vecClassify*p1vect) + math.log(pc)   #这是什么
    p0 = sum(vecClassify*p0vect) + math.log(1.0 - pc)
    if p1 > p0:
        return 1
    else:
        return 0

In [92]:
def testNB():
    #文档和标签
    file_list, file_labels = create_dataset()
    #词条
    vocal_list = createVocalList(file_list)
    #词条矩阵
    train_mat = []
    for doc in file_list:
        doc_vocal = setword(vocal_list, doc)
        train_mat.append(doc_vocal)
    p0vect, p1vect, pc = tranNB(np.array(train_mat), np.array(file_labels))
    
    #测试文档
    testfile = ['love','my','dalmation']
    #将测试文档转为词条
    test_doc_vocal = np.array(setword(vocal_list, testfile))
    print(testfile, "is", classifyNB(test_doc_vocal,p0vect, p1vect, pc))
    
    #测试文档2
    testfile2 = ['stupid','garbage']
    test_doc_vocal2 = np.array(setword(vocal_list, testfile2))
    print(testfile2,"is", classifyNB(test_doc_vocal2, p0vect, p1vect, pc))

In [93]:
testNB()

['love', 'my', 'dalmation'] is 0
['stupid', 'garbage'] is 1
