In [10]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [11]:
# Load the Iris dataset and pull out the feature matrix and the target vector.
iris = load_iris()
x, y = iris.data, iris.target

# Hold out 20% of the samples for validation; the fixed random_state makes
# the split repeatable across runs.
x_trn, x_val, y_trn, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=21,
)

In [12]:
# Prior probabilities
def calculate_prior(y):
    """Estimate class priors as the relative frequency of each label.

    Parameters
    ----------
    y : numpy.ndarray
        Label array; its first dimension is used as the sample count.

    Returns
    -------
    classes : numpy.ndarray
        The distinct labels found in ``y``, in the sorted order produced by
        ``np.unique``.
    prior : numpy.ndarray
        ``prior[i]`` is the fraction of entries of ``y`` equal to ``classes[i]``.
    """
    n_samples = y.shape[0]
    labels, counts = np.unique(y, return_counts=True)
    return labels, counts / n_samples

# Per-class mean and standard deviation of every feature
def calculate_statistics(x, y):
    """Compute the feature-wise mean and standard deviation for each class.

    Parameters
    ----------
    x : numpy.ndarray
        2-D sample matrix, one row per sample and one column per feature.
    y : numpy.ndarray
        Label of each row of ``x``.

    Returns
    -------
    mean : numpy.ndarray
        float64 array of shape ``(n_classes, n_features)``; row ``i`` holds
        the column means of the samples whose label is the ``i``-th value of
        ``np.unique(y)``.
    std : numpy.ndarray
        Same layout as ``mean``, holding the standard deviation
        (NumPy default, i.e. ``ddof=0``).
    """
    labels = np.unique(y)
    out_shape = (len(labels), x.shape[1])

    # One sub-matrix per class, in the same order as ``labels``.
    groups = [x[y == label] for label in labels]

    # The explicit dtype/reshape keep the result float64 with a fixed 2-D
    # shape even when there are no classes at all.
    mean = np.array(
        [rows.mean(axis=0) for rows in groups], dtype=np.float64
    ).reshape(out_shape)
    std = np.array(
        [rows.std(axis=0) for rows in groups], dtype=np.float64
    ).reshape(out_shape)

    return mean, std

# Per-feature Gaussian likelihood under one class
def calculate_likelihood(mean, std, x, min_std=1e-9):
    """Evaluate the normal probability density of each feature value.

    Parameters
    ----------
    mean : array_like
        Mean of each feature (broadcastable against ``x``).
    std : array_like
        Standard deviation of each feature (broadcastable against ``x``).
    x : array_like
        Feature values at which the density is evaluated.
    min_std : float, optional
        Lower bound applied to ``std`` before it is used. A feature that is
        constant within a class has ``std == 0``; without the bound the
        formula divides by zero and produces ``nan``/``inf``. Values of
        ``std`` at or above the bound are used unchanged.

    Returns
    -------
    numpy.ndarray
        Element-wise density ``exp(-(x - mean)^2 / (2 std^2)) / (sqrt(2 pi) std)``.
    """
    # Floor the spread so zero-variance features stay finite.
    std = np.maximum(std, min_std)
    exponent = -((x - mean) ** 2) / (2 * std ** 2)
    normaliser = np.sqrt(2 * np.pi) * std
    return np.exp(exponent) / normaliser

# Prediction (computed in log space)
def predict(X, classes, prior, mean, std, min_std=1e-9):
    """Assign each sample to the class with the highest log-posterior score.

    The score of class ``k`` for a sample is
    ``log(prior[k]) + sum_j log N(x_j | mean[k, j], std[k, j])``.
    The Gaussian log-density is evaluated analytically instead of as
    ``log(exp(...))``: for a sample far from every class mean the plain
    density underflows to 0 for all classes, every score becomes ``-inf``
    and ``argmax`` would silently return the first class.

    Parameters
    ----------
    X : array_like
        2-D sample matrix of shape ``(n_samples, n_features)``.
    classes : array_like
        Class labels; ``classes[k]`` corresponds to row ``k`` of ``mean``/``std``.
    prior : array_like
        Prior probability of each class, aligned with ``classes``.
    mean, std : array_like
        Per-class feature means and standard deviations, each of shape
        ``(n_classes, n_features)``.
    min_std : float, optional
        Lower bound applied to ``std`` so that a feature which is constant
        within a class (``std == 0``) does not cause a division by zero.

    Returns
    -------
    list
        Predicted label for every row of ``X``, in order.
    """
    X = np.asarray(X, dtype=np.float64)
    if X.shape[0] == 0:
        return []

    classes = np.asarray(classes)
    mean = np.asarray(mean, dtype=np.float64)
    std = np.maximum(np.asarray(std, dtype=np.float64), min_std)

    # Loop-invariant: the log-prior is the same for every sample.
    log_prior = np.log(np.asarray(prior, dtype=np.float64))

    # Broadcast to (n_samples, n_classes, n_features).
    diff = X[:, np.newaxis, :] - mean[np.newaxis, :, :]
    log_pdf = -0.5 * np.log(2.0 * np.pi) - np.log(std) - diff ** 2 / (2.0 * std ** 2)

    # Sum the per-feature log-densities and add the log-prior.
    log_posterior = log_prior + log_pdf.sum(axis=2)
    return list(classes[np.argmax(log_posterior, axis=1)])

In [13]:
# Fit: estimate the per-class Gaussian parameters and the class priors
# from the training split.
mean, std = calculate_statistics(x_trn, y_trn)
classes, prior = calculate_prior(y_trn)

# Evaluate: predict the validation split and report the share of labels
# that match the ground truth.
y_pred = predict(x_val, classes, prior, mean, std)
is_correct = np.asarray(y_pred) == y_val
acc = is_correct.mean()
print(f"The accuracy: {acc}")

The accuracy: 0.9666666666666667


In [16]:
import pandas as pd

# Translate each predicted class index into its name from the dataset.
result = [iris.target_names[label] for label in y_pred]

# One row per validation sample: the feature vector and the predicted class
# name, written to results.csv without the index column.
res = pd.DataFrame(
    {
        'data_val': list(x_val),
        'target_pred': result,
    }
)
res.to_csv('results.csv', index=False)