# Naive Bayes Classifier for Text Classification
- 朴素贝叶斯分类运用到的两个关键定义：条件独立假设和贝叶斯定理
- 本实验分别运用伯努利朴素贝叶斯和多项式朴素贝叶斯的方法分类
- 可参考网页 https://www.jianshu.com/p/b6cadf53b8b8 


## Packages

In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

## Text Classification
- Input: document D
- Output: the predicted class C

## Bag of Words 
- In this task, we use BOW to represent documents, either frequency or boolean variable
- Use frequency as feature:<img src='frequency.png'>
- Use boolean variable as feature:<img src='boolean.png'>

In [3]:
# 通常我们选取文本中具有代表性的部分词语来描述文本

# 根据已有的数据集，取出部分如下词语组成词典，其余词语用'UNKNOWN'表示
vocabulary = ['love', 'wonderful', 'best', 'great', 'superb', 'still', 'beautiful', 
              'bad', 'worst', 'stupid', 'waste', 'boring','UNKNOWN']

# 分词，将大段文本的各词语分隔开形成列表
def text_parse(big_string):
    list_of_tokens = re.split(r'\W',big_string)  
    return [tok.lower() for tok in list_of_tokens if len(tok) > 2] 


########################## Question 1 ########################## 

#分别补充完整transfer(fileDj, vocabulary)和transfer_bern(fileDj, vocabulary)函数
#tranfer将文本表示为以frequency计数的向量，transfer_bern将文本表示为以0或1计数的向量
#其中fileDj是单个文本经由函数text_parse拆分后的词的列表，返回值BOWDj为单个文本的向量表示
#对于vocabulary中的单词love,需将其loves,loving,loved三种形式统一转换为love
#注：这两个函数将用在下文的数据处理中，可先查看Data Loading部分帮助理解

############################################################### 


from nltk.stem import PorterStemmer

def transfer(fileDj, vocabulary):
    BOWDj = [0] * len(vocabulary)
    stemmer = PorterStemmer()
    
    ##################### Start For Your Code ##################### 
    for word in fileDj:
        stemmed_word = stemmer.stem(word)
        if stemmed_word in vocabulary:
            index = vocabulary.index(stemmed_word)
            BOWDj[index] += 1
    #####################  End For Your Code  ##################### 

    return BOWDj

def transfer_bern(fileDj, vocabulary):
    BOWDj = [0] * len(vocabulary)
    stemmer = PorterStemmer()
    
    ##################### Start For Your Code ##################### 
    for word in fileDj:
        stemmed_word = stemmer.stem(word)
        if stemmed_word in vocabulary:
            index = vocabulary.index(stemmed_word)
            BOWDj[index] = 1
    #####################  End For Your Code  ##################### 
            
    return BOWDj




## Data Loading
- Loading training and testing data respectively
- You can look up the data in `data_sets_naive_bayes/data_sets` file

In [4]:
# 用于存储经text_parse处理后的所有文本
doc_train = []
doc_test = []

# 用于存储经transfer处理后的所有文本
vec_train = []
vec_test = []

# 用于存储经transfer_bern处理后的所有文本
vec_train_bern = []
vec_test_bern = []

# 用于存储类别
y_train = []
y_test = []

# 分别读取训练集和测试集中好评和差评文件的路径
path1 = 'data_sets_naive_bayes/data_sets/training_set/pos'
path2 = 'data_sets_naive_bayes/data_sets/training_set/neg'
path3 = 'data_sets_naive_bayes/data_sets/test_set/pos'
path4 = 'data_sets_naive_bayes/data_sets/test_set/neg'

# 读取训练集

file_list1 = os.listdir(path1)

for file in file_list1:
    fo = open(path1 + '/' + file, encoding='utf-8', mode='r')
    wordlist = text_parse(fo.read())
    doc_train.append(wordlist)
    y_train.append(1)
    
file_list2 = os.listdir(path2)

for file in file_list2:
    fo = open(path2 + '/' + file, encoding='utf-8', mode='r')
    wordlist = text_parse(fo.read())
    doc_train.append(wordlist)
    y_train.append(0)
    
# 用等长向量来表示文本    
for doc in doc_train:
    vec_train_bern.append(transfer_bern(doc,vocabulary))
    vec_train.append(transfer(doc,vocabulary))

    
# 读取测试集

file_list3 = os.listdir(path3)

for file in file_list3:
    fo = open(path3 + '/' + file, encoding='utf-8', mode='r')
    wordlist = text_parse(fo.read())
    doc_test.append(wordlist)
    y_test.append(1)
    
file_list4 = os.listdir(path4)

for file in file_list4:
    fo = open(path4 + '/' + file, encoding='utf-8', mode='r')
    wordlist = text_parse(fo.read())
    doc_test.append(wordlist)
    y_test.append(0)

# 用等长向量来表示文本
for doc in doc_test:
    vec_test_bern.append(transfer_bern(doc,vocabulary))
    vec_test.append(transfer(doc,vocabulary))
    
# 将数据集转化为ndarray
# 训练集
train_data = np.array(vec_train)
train_data_bern = np.array(vec_train_bern)
train_label = np.array(y_train)
# 测试集
test_data = np.array(vec_test)
test_data_bern = np.array(vec_test_bern)
test_label = np.array(y_test)
print('以测试集为例展示数据集\n')
print('Using frequency as feature:\n')
print(test_data)
print('Using boolean variable as feature:\n')
print(test_data_bern)
print('label:')
print(test_label)

以测试集为例展示数据集

Using frequency as feature:

[[1 0 4 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 2 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [3 0 1 ... 0 0 0]]
Using boolean variable as feature:

[[1 0 1 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [1 0 1 ... 0 0 0]]
label:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


## Multinomial vs Multivariate Bernoulli Naive Bayes Classifier
- 已知贝叶斯公式：
$$ P(c|d)=\frac{P(c)P(d|c)}{p(d)}\propto P(c)P(d|c)$$
- 因此，要求最大的$P(c_i|d)$进而得到类别，应求出：
$$argmaxP(c_i)P(d|c_i)$$
- 在Multivariate Bernoulli Naive Bayes Classifier中，使用boolean variable：
$$P(D=d|C=c_i)=P(W_1=true,W_2=false,...,W_k=true|C=c_i)$$
- 在Multinomial Naive Bayes Classifier中，使用frequency：
$$P(D=d|C=c_i)=P(W_1=n_1,W_2=n_2,...,W_k=n_k|C=c_i)$$
- 其中$c$代表类别，$d$代表文件，$W_i$代表词典中的词语

## BNBC
- 条件独立假设(naive)
$$ P(D|C)=P(W_1,W_2,...,W_k|C)=P(W_1|C)P(W_2|C)...P(W_k|C) $$

- 因此取对数后：
$$ c=argmax\{logP(c_j)+\sum logp(w_i|c_j)\} $$
- 其中$w_i$表示$W_i$的取值
- 在本数据集中
$$P(c=1)=P(c=0)=\frac{1}{2}$$
- 因此需求出$p(w_i|c_j)$的值，给分子分母添加一个常数以smoothing
$$P(w_i=true|c_j)=\frac{files\ which\ include\ x_i\ and\ are\ in\ class\ c_j+1}{files\ are\ in\ class\ c_j+2}$$
$$P(w_i=false|c_j)=1-P(w_i=true|c_j)$$

In [5]:
########################## Question 2 ########################## 

#利用上述公式补充完整naiveBayesBernFeature_test(Xtest, ytest, thetaPosTrue, thetaNegTrue)函数
#得到预测值和准确率
#naiveBayesBernFeature_train(Xtrain, ytrain)已经分别求出P(w_i=true|0)和P(w_i=true|1)

############################################################### 

# 分别求出P(w_i=true|0)和P(w_i=true|1)
def naiveBayesBernFeature_train(Xtrain, ytrain): 
    
    length = len(Xtrain[0])
    vec_1 = np.ones(length)
    vec_0 = np.ones(length)
    num_1, num_0 = 2, 2
    for i, data in enumerate(Xtrain):
        if ytrain[i] == 1:  
            vec_1 += data     
            num_1 += 1  
        else:
            vec_0 += data
            num_0 += 1
    thetaPosTrue = vec_1/num_1  # 类别1的条件概率
    thetaNegTrue = vec_0/num_0  #  类别0的条件概率
    return thetaPosTrue, thetaNegTrue


def naiveBayesBernFeature_test(Xtest, ytest, thetaPosTrue, thetaNegTrue):
    yPredict = [0] * len(Xtest)

    for i, test_vec in enumerate(Xtest):
        ##################### Start For Your Code #####################
        prob_1 = np.sum(test_vec * np.log(thetaPosTrue) + (1 - test_vec) * np.log(1 - thetaPosTrue))
        prob_0 = np.sum(test_vec * np.log(thetaNegTrue) + (1 - test_vec) * np.log(1 - thetaNegTrue))

        if prob_1 >= prob_0:
            yPredict[i] = 1
        else:
            yPredict[i] = 0
        #####################  End For Your Code  #####################

    corr = 0
    for i, label in enumerate(yPredict):
        if label == ytest[i]:
            corr += 1
    Accuracy = corr / len(yPredict)

    return yPredict, Accuracy



thetaPosTrue, thetaNegTrue = naiveBayesBernFeature_train(train_data_bern, train_label)
print("thetaPosTrue =", thetaPosTrue)
print("thetaNegTrue =", thetaNegTrue)
print("--------------------")

yPredict, Accuracy = naiveBayesBernFeature_test(test_data_bern, test_label, thetaPosTrue, thetaNegTrue)
print("BNBC classification accuracy =", Accuracy)

thetaPosTrue = [0.44017094 0.0014245  0.5042735  0.42877493 0.06552707 0.37179487
 0.0014245  0.26495726 0.04558405 0.03988604 0.0014245  0.0014245
 0.0014245 ]
thetaNegTrue = [0.35754986 0.0014245  0.36324786 0.26923077 0.01851852 0.33475783
 0.0014245  0.5042735  0.19230769 0.18233618 0.0014245  0.0014245
 0.0014245 ]
--------------------
BNBC classification accuracy = 0.67


## MNBC
- 假定$P(W_1,W_2,...,W_k|C)$是一个多项式分布
- 文档分类中的$k$维字典对应于多项式分布中的向量的$k$个维度
$$ P(W_1=n_1,...,W_k=n_k|c,N,\theta_{1,c},...,\theta_{k,c})=\frac{N!}{n_1!n_2!...n_k!}\theta_{1,c}^{n_1}\theta_{2,c}^{n_2}...\theta_{k,c}^{n_k} $$
- 其中
$$ \sum_{i=1}^kn_i=N, \sum_{i=1}^k\theta_{i,c}=1 $$ 
- $\theta_{i,c}$对应于$w_i$在类别$c$中出现的概率，即需求出的值
- 省略阶乘，并取对数，可以得到
$$c=argmax\{logP(c_j)+\sum n_ilogp(w_i|c_j)\}$$
- After smoothing:

$$ P(w_i|c_j)=\frac{n_{i,j}+\alpha}{n_j+\alpha|vocabulary|}$$
- 其中，$n_j$为文本$Text_j$（类别为$c_j$的文本）的长度，$n_{i,j}$为$w_i$在$Text_j$中出现的次数
- 且$P(c=1)=P(c=0)=\frac{1}{2}, \alpha=1$

In [6]:
########################## Question 3 ########################## 

#补充完整naiveBayesMulFeature_train(Xtrain, ytrain)函数
#注意MNBC与BNBC求P(w_i|c_j)的差别即可

############################################################### 

def naiveBayesMulFeature_train(Xtrain, ytrain):
    length = len(Xtrain[0])
    vec_1 = np.ones(length)
    vec_0 = np.ones(length)
    num_1, num_0 = length, length
    
    ##################### Start For Your Code ##################### 

    length = len(Xtrain[0])
    vec_1 = np.ones(length)
    vec_0 = np.ones(length)
    num_1, num_0 = length, length
    
    for i, data in enumerate(Xtrain):
        if ytrain[i] == 1:  
            vec_1 += data     
            num_1 += sum(data)  
        else:
            vec_0 += data
            num_0 += sum(data)

    #####################  End For Your Code  ##################### 
            
    thetaPos = vec_1/num_1
    thetaNeg = vec_0/num_0
    return thetaPos, thetaNeg


########################## Question 4 ########################## 

#利用上述公式补充完整naiveBayesMulFeature_test(Xtest, ytest,thetaPos, thetaNeg)函数
#得到预测值和准确率

############################################################### 


def naiveBayesMulFeature_test(Xtest, ytest,thetaPos, thetaNeg):    
 
    yPredict = [0]*len(Xtest)  # 先假设样本是一个全为0的向量
    
    ##################### Start For Your Code ##################### 
    for i, test_vec in enumerate(Xtest):
        pos_prob = 1
        neg_prob = 1
        for j, val in enumerate(test_vec):
            pos_prob *= thetaPos[j] ** val
            neg_prob *= thetaNeg[j] ** val
        
        if pos_prob > neg_prob:
            yPredict[i] = 1
    #####################  End For Your Code  ####################
            
    corr = 0
    for i, label in enumerate(yPredict):
        if label == ytest[i]:
            corr = corr+1
    Accuracy = corr/len(yPredict)    
    return yPredict, Accuracy

########################## Question 5 ########################## 

#直接调用sklearn.naive_bayes.MultinomialNB()进行多项式朴素贝叶斯分类
#得到准确率

############################################################### 

def naiveBayesMulFeature_sk_MNBC(Xtrain, ytrain, Xtest, ytest):
    
    ##################### Start For Your Code ##################### 
    classifier = MultinomialNB()
    classifier.fit(Xtrain, ytrain)
    y_pred = classifier.predict(Xtest)
    #####################  End For Your Code  ##################### 
    
    corr = 0
    for i in range(0, len(y_pred)):
        if y_pred[i] == ytest[i]:
            corr = corr+1
    Accuracy = corr/len(y_pred)    
    return Accuracy


thetaPos, thetaNeg = naiveBayesMulFeature_train(train_data, train_label)
print("thetaPos =", thetaPos)
print("thetaNeg =", thetaNeg)
print("--------------------")

yPredict, Accuracy = naiveBayesMulFeature_test(test_data, test_label, thetaPos, thetaNeg)
print("MNBC classification accuracy =", Accuracy)
print("--------------------")

Accuracy_sk = naiveBayesMulFeature_sk_MNBC(train_data, train_label, test_data, test_label)
print("Sklearn MultinomialNB accuracy =", Accuracy_sk)

thetaPos = [0.23350254 0.00039047 0.23428348 0.22100742 0.02030457 0.15657946
 0.00039047 0.10542757 0.01327606 0.01366654 0.00039047 0.00039047
 0.00039047]
thetaNeg = [0.16808424 0.00040502 0.14661806 0.10692588 0.00526529 0.1381126
 0.00040502 0.29040097 0.07290401 0.06966383 0.00040502 0.00040502
 0.00040502]
--------------------
MNBC classification accuracy = 0.6683333333333333
--------------------
Sklearn MultinomialNB accuracy = 0.6683333333333333
