In [5]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
# Load the data: only the listed column positions are read from each CSV file.
train_data = pd.read_csv('traindd.CSV', encoding='utf-8', usecols=[0, 2, 3, 4, 5])
test_data = pd.read_csv('testdd.CSV', encoding='utf-8', usecols=[1, 2, 3, 4])
# Encode the categorical target variable:
#   pd.get_dummies -> one indicator column per category
#   pd.factorize   -> a single column of integer codes
# The uniques returned by factorize are kept in `develop_labels` so that the
# integer codes (and any predictions expressed as codes) can be decoded back to
# the original 'develop' values with develop_labels[code].
# NOTE(review): by default factorize encodes missing values as -1; confirm the
# 'develop' column has no NaNs before decoding with develop_labels.
train_data['Category'], develop_labels = pd.factorize(train_data['develop'])
# Keep only the columns from 'GDP' through 'Category' (label-based, inclusive slice).
train_data = train_data.loc[:, 'GDP':'Category']
# Feature matrix and target vector
X_train = train_data.drop('Category', axis=1).values
y_train = train_data['Category'].values
X_test = test_data.values

In [None]:
# Bookkeeping for the per-class statistics: the distinct class labels
# (sorted, as returned by np.unique), how many there are, and the number of
# feature columns in the training matrix.
classes = np.unique(y_train)
n_classes, n_features = classes.size, X_train.shape[1]

array([0, 1, 2])

In [15]:
# Per-class Gaussian parameters, each dictionary keyed by class label:
#   means       -> mean of the class's training rows (one value per feature)
#   covariances -> covariance of the class's training rows
#   priors      -> fraction of training rows that belong to the class
means, covariances, priors = {}, {}, {}

for label in classes:
    # Boolean mask selects the training rows labelled with this class.
    members = X_train[y_train == label]
    priors[label] = len(members) / len(X_train)
    means[label] = members.mean(axis=0)
    # rowvar=False: each column is a variable (feature), each row an observation.
    covariances[label] = np.cov(members, rowvar=False)

In [16]:
# Bayes discriminant function
def bayes_predict(X):
    """Return the class label(s) with the highest log-posterior score for X.

    For every label in the module-level ``classes`` array the score is
    ``log N(X | means[c], covariances[c]) + log priors[c]``, where the Gaussian
    log-density comes from ``multivariate_normal.logpdf`` (the full log-pdf,
    normalisation constant included). The label with the largest score wins.

    Parameters
    ----------
    X : array-like
        A single feature vector, or an array of feature vectors in whatever
        layout ``multivariate_normal.logpdf`` accepts.

    Returns
    -------
    The element(s) of ``classes`` selected by the arg-max over the class axis:
    one label for a single sample, an array of labels for several samples.
    """
    scores = [
        multivariate_normal.logpdf(X, mean=means[label], cov=covariances[label])
        + np.log(priors[label])
        for label in classes
    ]
    # axis=0 runs over classes, so a batch of samples yields one winner per sample.
    winner = np.argmax(scores, axis=0)
    return classes[winner]

In [17]:
# Classify the test set one row at a time and collect the predicted class codes.
y_pred = np.array(list(map(bayes_predict, X_test)))
y_pred

array([0, 2, 1, 1])