In [1]:
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import random

## 初始化数据集

In [2]:
def initDataset():
    """Load the iris data set into a single pandas DataFrame.

    Returns:
        pandas.DataFrame: one row per sample. The feature matrix returned by
        ``load_iris()`` becomes the leading, integer-labelled columns and the
        class codes are stored in a trailing ``"target"`` column.
    """
    # Fetch the bundled iris data (features in ``.data``, class codes in ``.target``).
    bunch = load_iris()
    # Wrap the feature matrix in a DataFrame so later steps can slice by position.
    frame = pd.DataFrame(bunch.data)
    # Keep the class codes next to the features as the last column.
    frame["target"] = bunch.target
    return frame

In [3]:
# Build the iris DataFrame and preview its first rows.
dataset = initDataset()
dataset.head()

Unnamed: 0,0,1,2,3,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
def plotDataset(dataset):
    """Placeholder for visualising the data set.

    The function currently only separates the feature columns from the label
    column; nothing is drawn and ``None`` is returned.

    Args:
        dataset: DataFrame whose last column holds the class labels and whose
            remaining columns hold the features.
    """
    # NOTE(review): the plotting itself has not been written yet.
    # Everything except the last column is treated as the feature matrix.
    features = dataset.iloc[:, :-1]
    # The last column is treated as the class labels.
    labels = dataset.iloc[:, -1]
    

## 随机切分数据集

In [5]:
def randSplit(dataset, rate, seed=None):
    """Randomly split ``dataset`` into a training part and a test part.

    Rows are selected by position, so the function works for any index and
    never modifies the caller's DataFrame.

    Args:
        dataset: pandas DataFrame to split.
        rate: fraction of rows (between 0 and 1 inclusive) that goes into the
            training part; ``int(n_rows * rate)`` rows are used for training.
        seed: optional seed for a private random generator, giving a
            reproducible split. When ``None`` the global ``random`` module
            state is used, so ``random.seed(...)`` still controls the result.

    Returns:
        tuple: ``(train, test)`` DataFrames, each re-indexed from 0.

    Raises:
        ValueError: if ``rate`` is outside ``[0, 1]``.
    """
    if not 0 <= rate <= 1:
        raise ValueError("rate must be between 0 and 1, got {!r}".format(rate))

    n_rows = dataset.shape[0]
    n_train = int(n_rows * rate)

    # Shuffle row positions instead of overwriting the index of the input frame.
    rng = random if seed is None else random.Random(seed)
    positions = list(range(n_rows))
    rng.shuffle(positions)

    # reset_index returns new frames, so both parts are independent of `dataset`
    # and both start at index 0.
    train = dataset.iloc[positions[:n_train]].reset_index(drop=True)
    test = dataset.iloc[positions[n_train:]].reset_index(drop=True)
    return train, test

In [6]:
# rate=0.7: int(0.7 * n_rows) rows go to the training split, the remainder to the test split.
train, test = randSplit(dataset, 0.7)

In [7]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,0,1,2,3,target
0,6.4,3.2,5.3,2.3,2
1,5.1,3.5,1.4,0.3,0
2,6.1,2.9,4.7,1.4,1
3,5.8,2.7,5.1,1.9,2
4,4.3,3.0,1.1,0.1,0


In [8]:
# Preview the first rows of the test split.
test.head()

Unnamed: 0,0,1,2,3,target
0,6.7,3.1,4.7,1.5,1
1,6.1,3.0,4.6,1.4,1
2,4.9,3.6,1.4,0.1,0
3,6.5,3.0,5.8,2.2,2
4,6.0,2.7,5.1,1.6,1


## 高斯朴素贝叶斯

In [9]:
def gs_classif(train, test, var_smoothing=1e-9):
    """Gaussian naive Bayes: fit on ``train`` and label every row of ``test``.

    Each class is scored in log space as ``log P(class)`` plus the sum of the
    per-feature Gaussian log-densities; the best-scoring class is predicted.
    Working with logs avoids underflow from multiplying many small densities.

    Args:
        train: DataFrame whose last column is the class label and whose other
            columns are numeric features.
        test: DataFrame with the same leading feature columns followed by the
            true label column. Any further columns (for example a ``"predict"``
            column from an earlier run) are ignored, so the function can be
            re-run on its own output.
        var_smoothing: fraction of the largest feature variance that is added
            to every per-class variance, so a feature that is constant within
            a class does not cause a division by zero.

    Returns:
        tuple: ``(result, acc)`` where ``result`` is a copy of ``test`` with a
        ``"predict"`` column and ``acc`` is the fraction of rows whose
        prediction equals the true label. ``test`` itself is not modified.

    Raises:
        ValueError: if ``train`` has no rows or no feature columns.
    """
    n_features = train.shape[1] - 1
    if train.shape[0] == 0 or n_features < 1:
        raise ValueError("train must contain at least one sample and one feature column")

    # value_counts yields both the class labels and the counts needed for the priors.
    class_counts = train.iloc[:, -1].value_counts()
    class_counts = class_counts[class_counts > 0]
    labels = class_counts.index.to_numpy()
    log_prior = np.log(class_counts.to_numpy(dtype=float) / class_counts.sum())

    # Per-class mean and (population) variance of every feature, shape (classes, features).
    x_train = train.iloc[:, :n_features].to_numpy(dtype=float)
    y_train = train.iloc[:, -1].to_numpy()
    means = np.vstack([x_train[y_train == label].mean(axis=0) for label in labels])
    variances = np.vstack([x_train[y_train == label].var(axis=0) for label in labels])

    # Keep every variance strictly positive; fall back to a unit scale when all
    # training features are constant.
    widest = x_train.var(axis=0).max()
    variances = variances + var_smoothing * (widest if widest > 0 else 1.0)

    # Features are taken by position using the training layout, not "all but the
    # last column", so an extra trailing column in `test` cannot leak in.
    x_test = test.iloc[:, :n_features].to_numpy(dtype=float)
    # Broadcast to shape (samples, classes, features).
    diff = x_test[:, np.newaxis, :] - means[np.newaxis, :, :]
    log_likelihood = -0.5 * (np.log(2 * np.pi * variances) + diff ** 2 / variances)
    joint = log_likelihood.sum(axis=2) + log_prior
    predicted = labels[joint.argmax(axis=1)]

    # Accuracy against the true labels, which sit right after the feature columns.
    predictions = pd.Series(predicted, index=test.index)
    acc = (predictions == test.iloc[:, n_features]).mean()

    # Work on a copy so the caller's frame is left untouched.
    result = test.copy()
    result["predict"] = predicted
    return result, acc

In [10]:
# Classify the test split; `test` is rebound to the returned frame, which carries a "predict" column.
test, acc = gs_classif(train, test)

In [11]:
# Fraction of test rows whose predicted class equals the true class.
acc

0.9333333333333333

## 改变测试集类名

In [15]:
def changeClassName(test):
    """Replace numeric class codes with iris species names, in place.

    The values of the second-to-last column (true labels) and of the last
    column (predictions) are read by position, translated, and written back
    to the ``"target"`` and ``"predict"`` columns of the same DataFrame.
    Values other than 0, 1 or 2 are left unchanged.

    Args:
        test: DataFrame whose last two columns hold the true and the
            predicted class codes.

    Returns:
        The same DataFrame object that was passed in.
    """
    species = (
        (0, "Iris-Setosa"),
        (1, "Iris-Versicolour"),
        (2, "Iris-Virginica"),
    )

    def _to_name(value):
        # Compare with == in code order so the matching rules stay exactly those
        # of a plain if/elif chain.
        for code, name in species:
            if value == code:
                return name
        return value

    # Read both columns as plain Python lists before anything is written back.
    actual = test.iloc[:, -2:-1].values.flatten().tolist()
    predicted = test.iloc[:, -1].values.flatten().tolist()

    test["target"] = [_to_name(value) for value in actual]
    test["predict"] = [_to_name(value) for value in predicted]
    return test

In [16]:
# Swap the numeric class codes in the "target" and "predict" columns for species names.
test = changeClassName(test)

In [17]:
# Show the test split with readable class names for both the true and the predicted label.
test

Unnamed: 0,0,1,2,3,target,predict
0,6.7,3.1,4.7,1.5,Iris-Versicolour,Iris-Versicolour
1,6.1,3.0,4.6,1.4,Iris-Versicolour,Iris-Versicolour
2,4.9,3.6,1.4,0.1,Iris-Setosa,Iris-Setosa
3,6.5,3.0,5.8,2.2,Iris-Virginica,Iris-Virginica
4,6.0,2.7,5.1,1.6,Iris-Versicolour,Iris-Versicolour
5,6.3,3.3,6.0,2.5,Iris-Virginica,Iris-Virginica
6,4.6,3.2,1.4,0.2,Iris-Setosa,Iris-Setosa
7,6.3,2.5,5.0,1.9,Iris-Virginica,Iris-Virginica
8,5.4,3.9,1.3,0.4,Iris-Setosa,Iris-Setosa
9,7.1,3.0,5.9,2.1,Iris-Virginica,Iris-Virginica
