# Project 4, Naïve Bayes classifier Implementation

# 1, Implmenting the naïve bayes classifier without using sk-learn.
# 2, Implementation need support both discrete and continous features.
# 3, classifier evaluation required.
# 4, Compare your implementation with Sk-learn regressor.

# 连续型数据

## 1、数据集载入，划分训练集与测试集

In [1]:
import numpy as np
import pandas as pd
iris = pd.read_csv('IrisData.csv')
def splitData(data_list, ratio):
    train_size = int(len(data_list)*ratio)
    np.random.shuffle(data_list)
    train_set = data_list[:train_size]
    test_set = data_list[train_size:]
    return train_set,test_set
data_list = np.array(iris).tolist()
trainset,testset = splitData(data_list,ratio = 0.7)
print('Split {0} samples into {1} train and {2} test samples '.format(len(data_list), len(trainset), len(testset)))

Split 150 samples into 105 train and 45 test samples 


## 2、计算先验概率

In [2]:
def seprateByClass(dataset):
  seprate_dict = {}
  info_dict = {}
  for vector in dataset:
      if vector[-1] not in seprate_dict:
          seprate_dict[vector[-1]] = []
          info_dict[vector[-1]] = 0
      seprate_dict[vector[-1]].append(vector)
      info_dict[vector[-1]] +=1
  return seprate_dict,info_dict
#两个返回值分别为划分好的数据字典，以及划分好的数据集中每个类别的样本数
train_separated,train_info = seprateByClass(trainset)
print(train_info)

{'virginica': 36, 'setosa': 34, 'versicolor': 35}


In [3]:
#计算属于每个类别的先验概率

def calulateClassPriorProb(dataset,dataset_info):
  dataset_prior_prob = {}
  sample_sum = len(dataset)
  for class_value, sample_nums in dataset_info.items():
      dataset_prior_prob[class_value] = sample_nums/float(sample_sum)
  return dataset_prior_prob

prior_prob = calulateClassPriorProb(trainset,train_info)
prior_prob

{'virginica': 0.34285714285714286,
 'setosa': 0.3238095238095238,
 'versicolor': 0.3333333333333333}

## 3、计算类条件概率

In [4]:
import math
# 均值
def mean(list):
  list = [float(x) for x in list] #字符串转数字
  return sum(list)/float(len(list))
# 方差
def var(list):
  list = [float(x) for x in list]
  avg = mean(list)
  var = sum([math.pow((x-avg),2) for x in list])/float(len(list)-1)
  return var
# 概率密度函数
def calculateProb(x,mean,var):
    exponent = math.exp(math.pow((x-mean),2)/(-2*var))
    p = (1/math.sqrt(2*math.pi*var))*exponent
    return p


In [5]:
#计算每个属性的均值和方差
def summarizeAttribute(dataset):
    dataset = np.delete(dataset,-1,axis = 1) # delete label
    summaries = [(mean(attr),var(attr)) for attr in zip(*dataset)]
    return summaries

summary = summarizeAttribute(trainset)
summary


[(5.843809523809524, 0.6786391941391939),
 (3.038095238095238, 0.1675732600732602),
 (3.797142857142855, 3.078164835164835),
 (1.2209523809523808, 0.5859029304029305)]

In [6]:
#按类别提取属性特征，这里会得到 类别数目*属性数目 组 （均值，方差）
def summarizeByClass(dataset):
  dataset_separated,dataset_info = seprateByClass(dataset)
  summarize_by_class = {}
  for classValue, vector in dataset_separated.items():
      summarize_by_class[classValue] = summarizeAttribute(vector)
  return summarize_by_class

train_Summary_by_class = summarizeByClass(trainset)
train_Summary_by_class


{'virginica': [(6.527777777777778, 0.40492063492063485),
  (2.9555555555555553, 0.10825396825396827),
  (5.499999999999998, 0.30571428571428577),
  (2.030555555555555, 0.08732539682539685)],
 'setosa': [(4.988235294117646, 0.12955436720142602),
  (3.385294117647059, 0.12432263814616758),
  (1.447058823529412, 0.033475935828877),
  (0.24705882352941178, 0.012869875222816398)],
 'versicolor': [(5.9714285714285715, 0.28915966386554626),
  (2.785714285714285, 0.08714285714285715),
  (4.328571428571429, 0.18445378151260505),
  (1.334285714285714, 0.034084033613445384)]}

In [7]:
#按类别将每个属性的条件概率相乘。

#我们前面已经将训练数据集按类别分好，这里就可以实现，输入的测试数据依据每类的每个属性（就那个类别数*属性数的字典）计算属于某类的类条件概率。
def calculateClassProb(input_data,train_Summary_by_class):
  prob = {}
  for class_value, summary in train_Summary_by_class.items():
      prob[class_value] = 1
      for i in range(len(summary)):
          mean,var = summary[i]
          x = input_data[i]
          p = calculateProb(x,mean,var)
      prob[class_value] *=p
  return prob

input_vector = testset[1]
input_data = input_vector[:-1]
train_Summary_by_class = summarizeByClass(trainset)
class_prob = calculateClassProb(input_data,train_Summary_by_class)
class_prob


{'virginica': 6.277087462572083e-09,
 'setosa': 3.226696701399974,
 'versicolor': 1.3732848984171491e-08}

## 4、先验概率*类条件概率

In [8]:
#朴素贝叶斯分类器
def bayesianPredictOneSample(input_data):
  prior_prob = calulateClassPriorProb(trainset,train_info)
  train_Summary_by_class = summarizeByClass(trainset)
  classprob_dict = calculateClassProb(input_data,train_Summary_by_class)
  result = {}
  for class_value,class_prob in classprob_dict.items():
      p = class_prob*prior_prob[class_value]
      result[class_value] = p
  return max(result,key=result.get)

## 5、单个样本测试

In [9]:
print(testset)
input_vector = testset[2]
input_data = input_vector[:-1]
result = bayesianPredictOneSample(input_data)
print("the sameple is predicted to class: {0}.".format(result))


[[4.9, 2.4, 3.3, 1.0, 'versicolor'], [4.6, 3.2, 1.4, 0.2, 'setosa'], [4.8, 3.4, 1.6, 0.2, 'setosa'], [5.6, 2.9, 3.6, 1.3, 'versicolor'], [5.5, 2.3, 4.0, 1.3, 'versicolor'], [5.4, 3.7, 1.5, 0.2, 'setosa'], [4.8, 3.0, 1.4, 0.3, 'setosa'], [6.6, 3.0, 4.4, 1.4, 'versicolor'], [6.9, 3.1, 5.4, 2.1, 'virginica'], [5.8, 2.7, 4.1, 1.0, 'versicolor'], [6.4, 3.2, 4.5, 1.5, 'versicolor'], [6.1, 2.8, 4.0, 1.3, 'versicolor'], [4.4, 3.2, 1.3, 0.2, 'setosa'], [5.9, 3.2, 4.8, 1.8, 'versicolor'], [6.5, 3.0, 5.5, 1.8, 'virginica'], [5.5, 4.2, 1.4, 0.2, 'setosa'], [5.2, 4.1, 1.5, 0.1, 'setosa'], [5.0, 3.5, 1.3, 0.3, 'setosa'], [6.2, 2.2, 4.5, 1.5, 'versicolor'], [5.6, 2.7, 4.2, 1.3, 'versicolor'], [5.7, 2.6, 3.5, 1.0, 'versicolor'], [5.4, 3.9, 1.7, 0.4, 'setosa'], [4.8, 3.4, 1.9, 0.2, 'setosa'], [4.9, 3.0, 1.4, 0.2, 'setosa'], [6.3, 2.9, 5.6, 1.8, 'virginica'], [4.9, 3.1, 1.5, 0.2, 'setosa'], [5.1, 2.5, 3.0, 1.1, 'versicolor'], [7.3, 2.9, 6.3, 1.8, 'virginica'], [6.0, 3.4, 4.5, 1.6, 'versicolor'], [6.4, 2

## 6、分类准确率

In [10]:
def calculateAccByBeyesian(dataset):
  correct = 0
  for vector in dataset:
      input_data = vector[:-1]
      label = vector[-1]
      result = bayesianPredictOneSample(input_data)
      if result == label:
          correct+=1
  return correct/len(dataset)

acc = calculateAccByBeyesian(testset)
acc


0.9777777777777777

# Sk-learn库实现

In [11]:
from sklearn import naive_bayes # 导入朴素贝叶斯
from sklearn.model_selection import train_test_split

trainset,testset = train_test_split(iris,test_size=0.1)
trainX = trainset.drop('label', axis=1) # 提取数据
trainY = trainset['label'] # 提取标签
testX = testset.drop('label', axis=1)
testY = testset['label']


# 建立高斯分布的朴素贝叶斯模型
#clf=naive_bayes.GaussianNB()  #高斯分布，没有参数
# 建立多项式分布的朴素贝叶斯模型
clf=naive_bayes.MultinomialNB()  #多项式分布

# 训练模型
clf.fit(trainX,trainY)
# 输出准确率
print("训练准确率:" + str(clf.score(trainX,trainY)))
print ("测试准确率:" + str(clf.score(testX,testY)))


训练准确率:0.9481481481481482
测试准确率:1.0


# 离散型数据

## 1、数据集载入，划分训练集与测试集

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
#car = pd.read_csv("CarEvalution.csv")
iris = pd.read_csv('IrisData.csv')
iris_train,iris_test = train_test_split(iris,test_size=0.3)
print(iris_train)
print(len(iris_test))

     Sepal.Length  Sepal.Width  Petal.Length  Petal.Width       label
130           7.4          2.8           6.1          1.9   virginica
35            5.0          3.2           1.2          0.2      setosa
74            6.4          2.9           4.3          1.3  versicolor
66            5.6          3.0           4.5          1.5  versicolor
2             4.7          3.2           1.3          0.2      setosa
..            ...          ...           ...          ...         ...
51            6.4          3.2           4.5          1.5  versicolor
54            6.5          2.8           4.6          1.5  versicolor
43            5.0          3.5           1.6          0.6      setosa
112           6.8          3.0           5.5          2.1   virginica
6             4.6          3.4           1.4          0.3      setosa

[105 rows x 5 columns]
45


## 2、获取相关参数 getParams(data)

In [13]:
def getParams(data):
        ck_counts = data.label.value_counts()#训练样本中类为ck的数量集合
        ck_name = np.array(ck_counts.index)#训练样本中类ck名称集合    
        DataNum = len(data)#训练样本总数N
        CNum = len(ck_counts)#类的个数K
        DataSet = data
        return ck_counts,ck_name,DataNum,CNum
DataSet = iris_train
ck_counts,ck_name,DataNum,CNum = getParams(iris_train)
print(ck_counts,ck_name,DataNum,CNum)

versicolor    36
setosa        35
virginica     34
Name: label, dtype: int64 ['versicolor' 'setosa' 'virginica'] 105 3


## 3、计算先验概率

In [14]:
def calPriorProb(CNum,ck_counts,DataNum,lamda):
    ck_PriorProb = []
    for i in range(CNum):
        cx_PriorProb = (ck_counts[i]+lamda)/(DataNum+CNum*lamda)
        ck_PriorProb.append(cx_PriorProb)
    return ck_PriorProb   
ck_PriorProb = calPriorProb(CNum,ck_counts,DataNum,lamda=1)
print(calPriorProb(CNum,ck_counts,DataNum,lamda=1))

[0.3425925925925926, 0.3333333333333333, 0.32407407407407407]


## 4、计算类条件概率

In [15]:
def calCondProb(ck_name,DataSet,CNum,lamda):
    names = locals()#使用动态变量
    CondProb = []#存储所有类别的所有特征取值的条件概率
    feat_value = []#所有特征取值列表
        
        #对于每一类别的数据集
    for i in range(len(ck_name)):
        names['Q%s' % i] = DataSet[DataSet["label"]==ck_name[i]]#按类别划分数据集
        names['ConProbC%s' % i] = []#定义动态变量，表示各类别中所有特征取值的条件概率集合
        feature_arr = DataSet.columns.tolist()[0:len(DataSet.columns)-1]#获取训练数据集特征集
          
            #对于每一个特征求该特征各个取值的条件概率
        for feature in (feature_arr):
            names['Q%s' % feature]=[]#定义动态变量，表示某个类别的某个特征的所有取值条件概率
                
                #对于某个特征的所有可能取值求条件概率
            for value in DataSet[feature].value_counts().index.tolist():
                   
                    #生成所有特征取值列表
                if value not in feat_value:#如果这个取值不在列表中，则加入这个取值
                    feat_value.append(value)
#                     print(feat_value)
                    #这里用了拉普拉斯平滑，使得条件概率不会出现0的情况
                    #如果某个类的某个特征取值在训练集上都出现过，则这样计算
                if value in names['Q%s' % i][feature].value_counts():
                    temp = (names['Q%s' % i][feature].value_counts()[value]+lamda)/(names['Q%s' % i][feature].value_counts().sum()+len(names['Q%s' % i][feature].value_counts())*lamda)
                    #如果某个类的某个特征取值并未在训练集上出现，为了避免出现0的情况，分子取1(即lamda平滑因子，取1时为拉普拉斯平滑)
                else:
                    temp = lamda/(names['Q%s' % i][feature].value_counts().sum()+len(names['Q%s' % i][feature].value_counts())*lamda)    
                names['Q%s' % feature].append(temp)#将求得的特征取值条件概率加入列表
            names['ConProbC%s' % i].extend(names['Q%s' % feature])#将得到的某个类别的某个特征的所有取值条件概率列表加入某个类别中所有特征取值的条件概率集合
        CondProb.append(names['ConProbC%s' % i])#将某个类别中所有特征取值的条件概率集合加入所有类别所有特征取值的条件概率集合
#     CondProb.append(feat_value)#将所有特征取值列表也加入所有类别所有特征取值的条件概率集合(后面用来做columns--列索引)
    index = ck_name.tolist() #用类别名称的集合来生成行索引index
    
#     index.extend(['other'])#此处由于我最后一行是feat_value，后面会删掉，因此在行索引上也多加一个，后面删掉
    CondProb = pd.DataFrame(CondProb,index)#将所有类别所有特征取值的条件概率集合转换为DataFrame格式
#     CondProb.drop(['other'],inplace = True)
    return names,CondProb,feat_value
names,CondProb,feat_value = calCondProb(ck_name,DataSet,CNum,lamda=1)    
print(CondProb)


                 0         1         2         3         4         5    \
versicolor  0.037037  0.055556  0.092593  0.037037  0.037037  0.111111   
setosa      0.145833  0.020833  0.020833  0.104167  0.125000  0.020833   
virginica   0.018868  0.113208  0.056604  0.037736  0.018868  0.037736   

                 6         7         8         9    ...       106       107  \
versicolor  0.055556  0.055556  0.074074  0.055556  ...  0.022727  0.022727   
setosa      0.041667  0.020833  0.020833  0.020833  ...  0.024390  0.097561   
virginica   0.056604  0.075472  0.056604  0.056604  ...  0.086957  0.021739   

                 108       109       110       111       112       113  \
versicolor  0.068182  0.068182  0.022727  0.022727  0.022727  0.022727   
setosa      0.024390  0.024390  0.024390  0.024390  0.024390  0.048780   
virginica   0.021739  0.021739  0.065217  0.043478  0.043478  0.021739   

                 114       115  
versicolor  0.022727  0.022727  
setosa      0.024390  0

## 5、对一个样本进行预测  

In [16]:
def predict(traindata,testdata):
    getParams(traindata)#获取参数
    calPriorProb(CNum,ck_counts,DataNum,lamda=1)#获取先验概率
    calCondProb(ck_name,traindata,CNum,lamda=1)#获取条件概率
    
    ClassTotalProb = []#初始化各类别总概率列表
    bestprob = -1#初始化最高概率
    bestfeat = ''#初始化最可能类别
    
    for feat in ck_name:
        pp = ck_PriorProb[ck_name.tolist().index(feat)]#pp为先验概率
        cp = 1#初始化条件概率
        for value in feat_value:
            if value in testdata.index.tolist():
                cp = cp * CondProb[value][feat]#计算各特征取值的条件概率之积
               
        TotalProb = pp * cp#条件概率之积与先验概率相乘
       
        ClassTotalProb.append(TotalProb)
    #ClassTotalProb = pd.DataFrame(ClassTotalProb,ck_name)
    #print(ClassTotalProb)
    
        #找到最可能类别和其概率    
    for i in range(len(ck_name)):
        if ClassTotalProb[i] > bestprob:
            bestprob = ClassTotalProb[i]
            bestfeat = ck_name[i]
    return bestprob,bestfeat
bestprob,bestfeat = predict(iris_train,iris_test)
print(bestprob,bestfeat)
# #print(car_test)

0.3425925925925926 versicolor


## 6、计算预测准确度

In [17]:
def getAccuracy(traindata,testdata):
    num = 0
    realFeat = testdata.label.tolist()
    
    for i in range(len(testdata)):
        temp = testdata.iloc[i][0:len(testdata.columns)-1]    
        predProb,predFeat = predict(testdata,temp)
        print(predProb,predFeat,realFeat[i])
        if(realFeat[i] == predFeat):
            num = num + 1
    acc = num / float(len(realFeat))
    return acc
acc = getAccuracy(iris_train,iris_test)
print(acc)

0.3425925925925926 versicolor setosa
0.3425925925925926 versicolor versicolor
0.3425925925925926 versicolor virginica
0.3425925925925926 versicolor virginica
0.3425925925925926 versicolor virginica
0.3425925925925926 versicolor setosa
0.3425925925925926 versicolor setosa
0.3425925925925926 versicolor setosa
0.3425925925925926 versicolor virginica
0.3425925925925926 versicolor setosa
0.3425925925925926 versicolor versicolor
0.3425925925925926 versicolor versicolor
0.3425925925925926 versicolor setosa
0.3425925925925926 versicolor virginica
0.3425925925925926 versicolor virginica
0.3425925925925926 versicolor virginica
0.3425925925925926 versicolor setosa
0.3425925925925926 versicolor virginica
0.3425925925925926 versicolor versicolor
0.3425925925925926 versicolor virginica
0.3425925925925926 versicolor versicolor
0.3425925925925926 versicolor virginica
0.3425925925925926 versicolor versicolor
0.3425925925925926 versicolor versicolor
0.3425925925925926 versicolor virginica
0.342592592592