In [39]:
import numpy as np
import pandas as pd
import random

# Load the data set; header=None keeps the first line of the file as a data row
dataSet = pd.read_csv("iris.txt", header=None)

In [40]:
# Show the first 5 rows of the data set
dataSet.head(5)

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Effect of omitting header=None: the first data row is consumed as the column header.
# The file is read again right here (without rebinding dataSet) so that this cell
# reproduces its output on a fresh kernel instead of relying on leftover state.
pd.read_csv("iris.txt").head()

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [41]:
def randSplit(dataSet,rate):
    """Randomly split a data set into a training part and a test part.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        The input data set. It is not modified; rows are selected by
        position, so any index (default, string, non-unique, ...) works.
    rate : float
        Fraction of rows that go to the training set, between 0 and 1.

    Returns
    -------
    train, test : pandas.DataFrame
        The first ``int(n * rate)`` rows of a random permutation and the
        remaining rows. Both carry a fresh 0..k-1 index.

    Raises
    ------
    ValueError
        If ``rate`` is outside the closed interval [0, 1].
    """
    if not 0 <= rate <= 1:
        raise ValueError("rate must be between 0 and 1, got %r" % (rate,))

    n = dataSet.shape[0]  # total number of rows
    m = int(n * rate)  # number of training rows

    # Draw a random label for every row position. This consumes the random
    # generator exactly like shuffling the index list did, so a seeded split
    # of a default-indexed frame picks the same rows as before.
    shuffled = list(range(n))
    random.shuffle(shuffled)

    # Row positions ordered by their random label: the rows labelled
    # 0..m-1 form the training set, the rest form the test set.
    order = sorted(range(n), key=shuffled.__getitem__)

    train = dataSet.iloc[order[:m], :].reset_index(drop=True)
    test = dataSet.iloc[order[m:], :].reset_index(drop=True)
    return train , test

In [47]:
# 80 % of the rows for training, the remaining 20 % for testing
train, test = randSplit(dataSet, rate=0.8)

In [69]:
def gnb_classify(train,test):
    """Fit a Gaussian naive Bayes classifier on ``train`` and label ``test``.

    The last column of ``train`` holds the class label and all columns before
    it are numeric features. ``test`` must have the same features in the same
    leading positions, followed by the true label column.

    For every class the per-feature mean and population variance are
    estimated from ``train``. Each test row is assigned the class that
    maximises ``log P(class) + sum_i log N(x_i | mean_i, var_i)``. The class
    prior is estimated from the class frequencies in ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training set (features..., label).
    test : pandas.DataFrame
        Test set (features..., label). A ``'predict'`` column is written to
        this frame in place; calling the function again overwrites it.

    Returns
    -------
    pandas.DataFrame
        The same ``test`` object, now carrying the ``'predict'`` column.
        The accuracy is printed as a side effect.

    Raises
    ------
    ValueError
        If ``train`` contains no labelled rows.
    """
    n_features = train.shape[1] - 1  # everything but the trailing label column
    y_train = train.iloc[:, -1]
    class_counts = y_train.value_counts()  # class label -> number of training rows
    if len(class_counts) == 0:
        raise ValueError("train contains no labelled rows")
    labels = np.asarray(class_counts.index)

    x_train = train.iloc[:, :n_features].to_numpy(dtype=float)
    means = np.empty((len(labels), n_features))  # one row per class
    variances = np.empty((len(labels), n_features))  # one row per class
    for row, label in enumerate(labels):
        members = x_train[(y_train == label).to_numpy()]  # rows of the current class
        means[row] = members.mean(axis=0)
        variances[row] = members.var(axis=0)  # population variance (divide by N)
    # A constant feature or a single-sample class gives zero variance, which
    # would turn the density into NaN/inf; floor it at a tiny positive value.
    variances = np.maximum(variances, 1e-9)

    # Class priors estimated from the training frequencies.
    log_prior = np.log(class_counts.to_numpy(dtype=float) / class_counts.sum())

    # Features are taken by position so that a 'predict' column left over from
    # a previous call is not mistaken for a feature or for the true label.
    x_test = test.iloc[:, :n_features].to_numpy(dtype=float)
    diff = x_test[:, np.newaxis, :] - means[np.newaxis, :, :]  # (n_test, n_classes, n_features)
    # Sum log densities instead of multiplying densities to avoid underflow.
    log_likelihood = -0.5 * (np.log(2 * np.pi * variances) + diff ** 2 / variances).sum(axis=2)
    scores = log_likelihood + log_prior  # (n_test, n_classes)

    test['predict'] = labels[np.argmax(scores, axis=1)]  # most probable class per row
    acc = (test['predict'] == test.iloc[:, n_features]).mean()  # prediction accuracy
    print("模型预测准确率{%f}"%acc)
    return test

In [70]:
# Draw a fresh 80/20 split and show the labelled test set
train, test = randSplit(dataSet, rate=0.8)
gnb_classify(train, test)

模型预测准确率{1.000000}


Unnamed: 0,0,1,2,3,4,predict
0,6.2,2.8,4.8,1.8,Iris-virginica,Iris-virginica
1,5.6,2.9,3.6,1.3,Iris-versicolor,Iris-versicolor
2,5.2,4.1,1.5,0.1,Iris-setosa,Iris-setosa
3,6.7,2.5,5.8,1.8,Iris-virginica,Iris-virginica
4,5.4,3.4,1.7,0.2,Iris-setosa,Iris-setosa
5,5.8,2.7,4.1,1.0,Iris-versicolor,Iris-versicolor
6,5.4,3.9,1.3,0.4,Iris-setosa,Iris-setosa
7,7.2,3.2,6.0,1.8,Iris-virginica,Iris-virginica
8,5.7,2.6,3.5,1.0,Iris-versicolor,Iris-versicolor
9,5.5,3.5,1.3,0.2,Iris-setosa,Iris-setosa


In [71]:
# Repeat split + classification 20 times to see how the accuracy varies
for i in range(20):
    train, test = randSplit(dataSet, rate=0.8)
    gnb_classify(train, test)



模型预测准确率{1.000000}
模型预测准确率{0.966667}
模型预测准确率{0.933333}
模型预测准确率{1.000000}
模型预测准确率{0.966667}
模型预测准确率{1.000000}
模型预测准确率{1.000000}
模型预测准确率{1.000000}
模型预测准确率{0.866667}
模型预测准确率{1.000000}
模型预测准确率{0.933333}
模型预测准确率{0.900000}
模型预测准确率{0.966667}
模型预测准确率{0.966667}
模型预测准确率{0.933333}
模型预测准确率{0.933333}
模型预测准确率{0.966667}
模型预测准确率{0.966667}
模型预测准确率{1.000000}
模型预测准确率{0.866667}


In [None]:
if __name__ == '__main__':
    # Fix the seed so the 20 accuracies below are reproducible on every run
    random.seed(42)
    # Load the data once: the file does not change between iterations and
    # randSplit hands back new frames for every split
    dataSet = pd.read_csv("iris.txt",header = None)
    for i in range(20):
        train,test= randSplit(dataSet, 0.8)
        gnb_classify(train,test)
