In [1]:
import ast
import math

import jieba
import pandas as pd

import tool

def trainNB(label,terms,docs,_class):
    """Estimate multinomial Naive Bayes parameters with add-one smoothing.

    Parameters
    ----------
    label : name of the column in ``docs`` holding each document's class.
    terms : name of the column in ``docs`` holding each document's iterable
        of tokens.
    docs : pandas DataFrame with one row per document.
    _class : iterable of every class label; each label found in ``docs``
        must be present here, otherwise a ``KeyError`` is raised.

    Returns
    -------
    (prior, condprob)
        ``prior[c]`` is the fraction of documents labelled ``c``.
        ``condprob[word][c]`` is
        ``(count of word in class c + 1) / (tokens in class c + vocabulary size)``.

    Raises
    ------
    ZeroDivisionError
        If ``docs`` has no rows while ``_class`` is non-empty.
    """
    # Per-class tallies used for the priors and the smoothing denominators.
    docs_per_class = {c: 0 for c in _class}
    tokens_per_class = {c: 0 for c in _class}
    total_docs = 0

    # term_counts[word][c] = number of occurrences of `word` in class `c` documents.
    term_counts = {}

    for doc_label, doc_terms in zip(docs[label], docs[terms]):
        docs_per_class[doc_label] += 1
        total_docs += 1  # every row is one document

        for word in doc_terms:
            if word not in term_counts:
                # First sighting of this word: start every class at zero.
                term_counts[word] = {c: 0 for c in _class}
            term_counts[word][doc_label] += 1
            tokens_per_class[doc_label] += 1

    # Number of distinct words seen across the whole training set.
    vocabulary_size = len(term_counts)

    prior = {}
    condprob = {word: {} for word in term_counts}
    for c in _class:
        prior[c] = docs_per_class[c]/total_docs
        denominator = tokens_per_class[c]+vocabulary_size
        for word, per_class in term_counts.items():
            condprob[word][c] = (per_class[c]+1)/denominator

    return prior,condprob
    

def predictNB(words,prior,condprob,_class):
    """Classify one document with a trained Naive Bayes model.

    Parameters
    ----------
    words : iterable of tokens of the document; duplicates are ignored
        because the tokens are collapsed into a set before scoring.
    prior : dict mapping class -> prior probability (must be > 0).
    condprob : dict mapping word -> {class: conditional probability}.
        Words missing from this dict are skipped.
    _class : iterable of candidate class labels.

    Returns
    -------
    (best_class, best_score)
        The class with the highest log-score and that score. When several
        classes tie, the one appearing last in ``_class`` wins. For an
        empty ``_class`` the result is ``('', -inf)``.
    """
    words = set(words)
    score = {}
    for c in _class:
        score[c] = math.log(prior[c])
        for word in words:
            if word in condprob:
                score[c] += math.log(condprob[word][c])

    # Log-scores are sums of logs of probabilities, hence <= 0. Starting the
    # running maximum at 0 (as before) meant no class could ever beat it and
    # the function always returned ('', 0). Start from -inf instead.
    maxNB = -math.inf
    maxC = ''
    for c in _class:
        if score[c]>=maxNB:
            maxNB = score[c]
            maxC = c

    return (maxC,maxNB)

def filterFeature(fea_dict,label,terms):
    """Return the terms that belong to the feature collection of ``label``.

    ``fea_dict[label]`` is the collection of selected features for that
    class. Order and duplicates of ``terms`` are preserved. The lookup is
    only performed while iterating, so an empty ``terms`` never touches
    ``fea_dict``.
    """
    def is_selected(term):
        return term in fea_dict[label]

    return list(filter(is_selected, terms))

In [38]:
# Load the documents and tokenize title + content of every row.
# NOTE(review): tool.clean_cn_str_jieba_search presumably returns a list of
# tokens with stopwords removed — confirm against the `tool` module.
data = pd.read_csv('../data/gwt_info.csv')
stopwords = tool.getStopwords('../data/cn_stopwords.txt')
data['words'] = data.apply(lambda x: tool.clean_cn_str_jieba_search(str(x['title'])+' '+str(x['content']),stopwords),axis=1)

# Rows with no < 20 are used for training, rows with 20 <= no < 30 for testing.
# .copy() gives each split its own frame: new columns are assigned to both
# splits later, and doing that on a slice of `data` triggers pandas'
# SettingWithCopyWarning.
_class = data['department'].unique()
train_docs = data[data['no']<20][['department','words']].copy()
test_docs = data[(data['no']<30) & (data['no']>=20)][['department','words']].copy()

# Baseline: train on all tokens (no feature selection) and score the test split.
# Each prediction is a (class, log-score) tuple; element 0 is the predicted class.
prior,condprob = trainNB('department','words',train_docs,_class)
test_docs['predict_nofea'] = test_docs.apply(lambda x: predictNB((x['words']),prior,condprob,_class),axis=1)
test_docs['is_correct_nofea'] = test_docs.apply(lambda x: x['department']==x['predict_nofea'][0],axis=1)

# Feature-selection results, one column per class (columns are indexed by class label later on).
mi = pd.read_csv('../data/MI_result_jieba_search.csv')
chi = pd.read_csv('../data/CHI_result_jieba_search.csv')

# class -> list of selected feature terms, filled in by a later cell.
mi_fea = {}
chi_fea = {}


In [39]:
# Build class -> list of selected terms from the MI / CHI result tables.
# Each cell is parsed as a Python literal and its first element is kept
# (presumably a "(term, score)" pair — TODO confirm against the CSV files).
# ast.literal_eval only accepts literals, so unlike eval() it cannot run
# arbitrary code that happens to be stored in the CSV.
for c in _class:
    mi_fea[c] = list(mi[c].apply(lambda x:ast.literal_eval(x)[0]))
    chi_fea[c] = list(chi[c].apply(lambda x:ast.literal_eval(x)[0]))

In [40]:
# For every training document keep only the tokens that are selected features
# of the document's own department, once per feature-selection method.
for target_col, selected in (('words_mi', mi_fea), ('words_chi', chi_fea)):
    train_docs[target_col] = train_docs.apply(
        lambda row: filterFeature(selected, row['department'], row['words']),
        axis=1,
    )

In [25]:
# Display the word -> {class: conditional probability} dict returned by trainNB (prints the whole dict).
condprob

{'做好': {'党政办公室': 0.0016006984866123398,
  '教务部': 0.0005110841372260908,
  '招生办公室': 0.0006360109393881574,
  '研究生院': 0.0008053524965927394,
  '科学技术部': 0.0002743032696949748},
 '我校': {'党政办公室': 0.001018626309662398,
  '教务部': 0.0008305117229923976,
  '招生办公室': 0.0024168415696749984,
  '研究生院': 0.0021063065295502414,
  '科学技术部': 0.00038402457757296467},
 '2021': {'党政办公室': 0.004583818393480791,
  '教务部': 0.005046955855107647,
  '招生办公室': 0.0023532404757361826,
  '研究生院': 0.0044604138272828645,
  '科学技术部': 0.0018652622339258285},
 '-': {'党政办公室': 0.00036379511059371364,
  '教务部': 0.0022359931003641475,
  '招生办公室': 0.0008268142212046047,
  '研究生院': 0.002168256721595837,
  '科学技术部': 0.004059688391485627},
 '2022': {'党政办公室': 0.0009458672875436554,
  '教务部': 0.0024276496518239317,
  '招生办公室': 0.0009540164090822362,
  '研究生院': 0.0020443563375046464,
  '科学技术部': 0.0025784507351327627},
 '学年': {'党政办公室': 0.0002910360884749709,
  '教务部': 0.001086053791605443,
  '招生办公室': 0.000508808751510526,
  '研究生院': 0.00012390038409

In [41]:
# Retrain on the MI-filtered training tokens, then classify the test
# documents using their full (unfiltered) token lists.
prior_mi, condprob_mi = trainNB('department', 'words_mi', train_docs, _class)
test_docs['predict_mi'] = test_docs['words'].apply(
    lambda words: predictNB(words, prior_mi, condprob_mi, _class)
)
# predict_mi holds (class, log-score) tuples; compare the class part only.
test_docs['is_correct_mi'] = test_docs.apply(
    lambda row: row['department'] == row['predict_mi'][0],
    axis=1,
)

In [42]:
# Retrain on the CHI-filtered training tokens, then classify the test
# documents using their full (unfiltered) token lists.
prior_chi, condprob_chi = trainNB('department', 'words_chi', train_docs, _class)
test_docs['predict_chi'] = test_docs['words'].apply(
    lambda words: predictNB(words, prior_chi, condprob_chi, _class)
)
# predict_chi holds (class, log-score) tuples; compare the class part only.
test_docs['is_correct_chi'] = test_docs.apply(
    lambda row: row['department'] == row['predict_chi'][0],
    axis=1,
)

In [43]:
# Accuracy = mean of the boolean is_correct_* columns.
# The value must be applied with the % operator; passing it as a second
# argument to print() leaves the literal "%f" in the output.
print('No Feature accuracy : %f' % test_docs['is_correct_nofea'].mean())
print('MI Feature accuracy : %f' % test_docs['is_correct_mi'].mean())
print('CHI Feature accuracy : %f' % test_docs['is_correct_chi'].mean())

No Feature accuracy : %f 0.84
MI Feature accuracy : %f 0.94
CHI Feature accuracy : %f 0.94


In [32]:
def predictNB(words,prior,condprob,_class):
    """Pick the most likely class for a document under a Naive Bayes model.

    Parameters
    ----------
    words : iterable of tokens; repeated tokens count once because the
        tokens are reduced to a set.
    prior : dict mapping class -> prior probability.
    condprob : dict mapping word -> {class: conditional probability};
        tokens absent from it are ignored.
    _class : indexable sequence of candidate classes (``_class[0]`` is read).

    Returns
    -------
    (best_class, best_score)
        The winning class and its log-score. On ties the class that comes
        last in ``_class`` is returned.
    """
    unique_tokens = set(words)

    score = {}
    for c in _class:
        running = math.log(prior[c])
        for token in unique_tokens:
            if token not in condprob:
                continue
            running += math.log(condprob[token][c])
        score[c] = running

    # Seed the maximum with the first class's score; the >= comparison below
    # guarantees best_class is replaced on the first iteration.
    best_score = score[_class[0]]
    best_class = ''
    for c in _class:
        candidate = score[c]
        if candidate >= best_score:
            best_class, best_score = c, candidate

    return (best_class, best_score)