In [5]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from numpy import var
import math

In [6]:
# Load the iris dataset and build a DataFrame with named feature columns
iris = datasets.load_iris()
df = pd.DataFrame(
    iris.data,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
df['label'] = iris.target

# Feature matrix and target vector
x, y = iris.data, iris.target

# Split into training and test sets (30% held out, fixed random seed)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=100
)


In [9]:
class Model:
    """Gaussian naive Bayes classifier.

    For every class the per-feature mean and variance are estimated from the
    training data. A sample is assigned to the class that maximises
    prior * product of per-feature Gaussian densities (features are assumed
    conditionally independent given the class).
    """

    def __init__(self):
        self.y = None            # training labels, set by fit()
        self.classes = None      # sorted unique class labels, set by fit()
        self.classes_num = None  # number of distinct classes, set by fit()
        self.parameters = []     # parameters[i][j] = {"mean", "var"} for class i, feature j

    def _calculate_prior(self, c):
        """Return the prior P(y = c) as the relative frequency of class ``c``
        among the training labels (the maximum-likelihood estimate).
        """
        return np.mean(self.y == c)

    def _calculate_likelihood(self, mean, var, X):
        """Return the Gaussian density of the scalar ``X`` for the given
        ``mean`` and variance ``var``.
        """
        eps = 1e-4  # keeps the denominators non-zero when var == 0
        coeff = 1.0 / math.sqrt(2.0 * math.pi * var + eps)
        exponent = math.exp(-(math.pow(X - mean, 2) / (2 * var + eps)))
        return coeff * exponent

    def _calculate_log_likelihood(self, mean, var, X):
        """Return the natural log of ``_calculate_likelihood(mean, var, X)``.

        Computed analytically so the result stays finite even when the plain
        density would underflow to 0.
        """
        eps = 1e-4  # same smoothing term as in _calculate_likelihood
        log_coeff = -0.5 * math.log(2.0 * math.pi * var + eps)
        log_exponent = -(math.pow(X - mean, 2) / (2 * var + eps))
        return log_coeff + log_exponent

    def _calculate_probabilities(self, X):
        """Return the class label with the largest posterior for one sample.

        ``X`` is an iterable of feature values for a single sample. The
        posterior is accumulated in log-space: a raw product of densities
        underflows to 0 for distant samples, which would make every class tie
        and the first class win regardless of the data.
        """
        log_posteriors = []
        for c, class_params in zip(self.classes, self.parameters):
            # log p(y); every class was observed in fit(), so the prior is > 0
            log_posterior = math.log(self._calculate_prior(c))
            # Independence assumption: log p(x | y) = sum_j log p(x_j | y)
            for feature_value, params in zip(X, class_params):
                log_posterior += self._calculate_log_likelihood(
                    params["mean"], params["var"], feature_value
                )
            log_posteriors.append(log_posterior)
        # argmax of the log-posterior equals argmax of the posterior
        return self.classes[np.argmax(log_posteriors)]

    def fit(self, train_data, train_label):
        """Estimate per-class, per-feature mean and variance.

        ``train_data`` is a 2-D array-like (samples x features) and
        ``train_label`` a 1-D array-like with one label per sample.
        """
        train_data = np.asarray(train_data)
        self.y = np.asarray(train_label)
        # Derive the classes from the labels passed in, not from a global
        self.classes = np.unique(self.y)
        self.classes_num = len(self.classes)
        # Start from a clean slate so refitting does not keep stale statistics
        self.parameters = []
        for c in self.classes:
            # Rows of the training data that belong to class c
            X_where_c = train_data[self.y == c]
            self.parameters.append(
                [{"mean": col.mean(), "var": col.var()} for col in X_where_c.T]
            )

    def predict(self, X):
        """Return a list with the predicted class label for each sample in ``X``."""
        y_pred = [self._calculate_probabilities(sample) for sample in X]
        return y_pred

    def score(self, X, y):
        """Return the accuracy (fraction of correct predictions) on ``X`` / ``y``."""
        y_pred = self.predict(X)
        # Coerce both sides so plain Python lists compare element-wise
        accuracy = np.mean(np.asarray(y) == np.asarray(y_pred))
        return accuracy

In [10]:

model = Model()
model.fit(train_data=x_train, train_label=y_train)
# Report accuracy on the held-out test data
print(model.score(X=x_test, y=y_test))


0.9555555555555556
