# 贝叶斯分类器实践


In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

from fontTools.subset import subset
from scipy.stats import norm
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
class NaiveBayesClassifier:
    def __init__(self, laplace=True):
        self.class_priors = {}  # 先验概率 P(c)
        self.likelihood = {}    # 似然 P(x|c)
        self.feature_types = {} # 特征类型，连续或离散
        self.laplace = laplace

    def fit(self, X: pd.DataFrame, y: pd.Series):
        self.classes = np.unique(y)
        self.feature_types = {col: 'categorical' if X[col].dtype == 'object' else 'continuous' for col in X.columns}
        # P(c)
        class_counts = y.value_counts().to_dict()
        total_samples = len(y)
        self.class_priors = {c: class_counts[c] / total_samples for c in self.classes}
        # P(x|c)
        self.likelihood = {c: {} for c in self.classes}
        for c in self.classes:
            subset = X[y == c]
            for col in X.columns:
                if self.feature_types[col] == 'categorical':
                    # 计算 P(x=value|c)
                    value_counts = subset[col].value_counts().to_dict()
                    total_count = len(subset) + (len(X[col].unique()) if self.laplace else 0)
                    self.likelihood[c][col] = {val : (value_counts.get(val, 0) + (1 if self.laplace else 0)) / total_count for val in X[col].unique()}
                else:
                    # 计算正态分布参数
                    mean = subset[col].mean()
                    std = subset[col].std() if subset[col].std() > 0 else 1e-6
                    self.likelihood[c][col] = (mean, std)
        return self

    def predict(self, X: pd.DataFrame):
        y_pred = []
        for _, row in X.iterrows():
            class_probs = {}
            for c in self.classes:
                # P(c|x)
                prob = np.log(self.class_priors[c])
                for col in X.columns:
                    if self.feature_types[col] == 'categorical':
                        prob += np.log(self.likelihood[c][col].get(row[col], 1e-6))
                    else:
                        mean, std = self.likelihood[c][col]
                        prob += np.log(norm.pdf(row[col], loc=mean, scale=std))
                class_probs[c] = prob
            y_pred.append(max(class_probs, key=class_probs.get))
        return np.array(y_pred)

In [3]:
def load_dataset(name: str):
    """
    根据名称加载不同的数据集
    :param name: 数据集名称 ['watermelon', 'iris', 'adult', 'spam', 'wine']
    :return: X, y (DataFrame, Series)
    """
    if name == 'watermelon': # 经典西瓜数据集
        data = pd.DataFrame([
            ['青绿', '蜷缩', '浊响', '清晰', '好瓜'],
            ['乌黑', '蜷缩', '沉闷', '清晰', '好瓜'],
            ['乌黑', '蜷缩', '浊响', '清晰', '好瓜'],
            ['青绿', '蜷缩', '沉闷', '清晰', '好瓜'],
            ['浅白', '蜷缩', '浊响', '清晰', '好瓜'],
            ['青绿', '稍蜷', '浊响', '清晰', '好瓜'],
            ['乌黑', '稍蜷', '浊响', '稍糊', '好瓜'],
            ['乌黑', '稍蜷', '浊响', '清晰', '好瓜'],
            ['乌黑', '稍蜷', '沉闷', '稍糊', '坏瓜'],
            ['青绿', '硬挺', '清脆', '清晰', '坏瓜'],
            ['浅白', '硬挺', '清脆', '模糊', '坏瓜']
        ], columns=['色泽', '根蒂', '敲声', '纹理', '好瓜'])
        X, y = data.iloc[:, :-1], data.iloc[:, -1]
    elif name == 'iris': # 鸢尾花数据集
        iris = datasets.load_iris()
        X = pd.DataFrame(iris.data, columns=iris.feature_names)
        y = pd.Series(iris.target).astype(str)
    elif name == 'wine': # 葡萄酒质量数据集
        wine = datasets.load_wine()
        X = pd.DataFrame(wine.data, columns=wine.feature_names)
        y = pd.Series(wine.target).astype(str)
    elif name == 'adult': # Adult (收入预测)
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
        columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
        data = pd.read_csv(url, names=columns, na_values=" ?", skipinitialspace=True)
        data.dropna(inplace=True)
        X, y = data.iloc[:, :-1], data.iloc[:, -1]
    elif name == 'spam': # Spam (垃圾邮件分类)
        from sklearn.datasets import fetch_openml
        spam = fetch_openml(name="spambase", version=1)
        X = pd.DataFrame(spam.data)
        y = spam.target.astype(str)
    else:
        raise ValueError("未找到数据集，请选择 ['watermelon', 'iris', 'adult', 'spam', 'wine']")
    return X, y

In [4]:
def train_test_naive(name: str, laplace: bool):
    X, y = load_dataset(name)
    if X.select_dtypes(include=['object']).shape[1] > 0:
        X = X.apply(LabelEncoder().fit_transform)   # 对离散特征进行编码
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    nb = NaiveBayesClassifier(laplace=laplace).fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    print(f"=== {name} 数据集 ===")
    print(f"分类准确率: {accuracy_score(y_test, y_pred):.4f}")
    print("混淆矩阵:\n", confusion_matrix(y_test, y_pred))
    print("分类报告:\n", classification_report(y_test, y_pred))

In [5]:
train_test_naive('wine', True)

=== wine 数据集 ===
分类准确率: 1.0000
混淆矩阵:
 [[19  0  0]
 [ 0 21  0]
 [ 0  0 14]]
分类报告:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        14

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54



In [6]:
train_test_naive('spam', True)

  prob += np.log(norm.pdf(row[col], loc=mean, scale=std))


=== spam 数据集 ===
分类准确率: 0.8146
混淆矩阵:
 [[583 221]
 [ 35 542]]
分类报告:
               precision    recall  f1-score   support

           0       0.94      0.73      0.82       804
           1       0.71      0.94      0.81       577

    accuracy                           0.81      1381
   macro avg       0.83      0.83      0.81      1381
weighted avg       0.85      0.81      0.82      1381



In [7]:
class AODEClassifier:
    def __init__(self, m=30):
        self.m = m
        self.feature_types = {}
        self.p_ci = {}  # P(c, x_i)
        self.p_jci = {} # P(x_j|c, x_i)

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.ns = [len(np.unique(X[col])) for col in X.columns]
        self.lengths = []
        self.feature_types = {col: 'categorical' if X[col].dtype == 'object' else 'continuous' for col in X.columns}
        self.p_jci = {c1: {c2 : 0 for c2 in self.classes} for c1 in self.classes}
        self.p_ci = {c: 0 for c in self.classes}
        for i, c in enumerate(self.classes):
            subset = X[y == c]
            self.lengths.append(len(subset))
            self.p_ci[c] = (len(subset) + 1) / (len(X) + len(self.classes) * self.ns[i])
            for j, cc in enumerate(self.classes):
                subsubset = subset[y == cc]
                self.p_jci[cc][c] = (len(subsubset) + 1) / (len(subset) + self.ns[j])
        return self

    def predict(self, X):
        y_pred = []
        for _, row in X.iterrows():
            class_probs = {}
            for i, c in enumerate(self.classes):
                if self.lengths[i] < self.m:
                    continue
                prob = np.log(self.p_ci[c])
                for j, cc in enumerate(self.classes):
                    prob += np.log(self.p_jci[cc][c])
                class_probs[c] = prob
            y_pred.append(max(class_probs, key=class_probs.get))
        return np.array(y_pred)

In [8]:
def train_test_AODE(name: str):
    X, y = load_dataset(name)
    if X.select_dtypes(include=['object']).shape[1] > 0:
        X = X.apply(LabelEncoder().fit_transform)   # 对离散特征进行编码
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    nb = AODEClassifier(m=1).fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    print(f"=== {name} 数据集 ===")
    print(f"分类准确率: {accuracy_score(y_test, y_pred):.4f}")
    print("混淆矩阵:\n", confusion_matrix(y_test, y_pred))
    print("分类报告:\n", classification_report(y_test, y_pred))

In [9]:
train_test_AODE('watermelon')

=== watermelon 数据集 ===
分类准确率: 0.5000
混淆矩阵:
 [[0 2]
 [0 2]]
分类报告:
               precision    recall  f1-score   support

          坏瓜       0.00      0.00      0.00         2
          好瓜       0.50      1.00      0.67         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4



  subsubset = subset[y == cc]
  subsubset = subset[y == cc]
  subsubset = subset[y == cc]
  subsubset = subset[y == cc]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
train_test_naive('watermelon', True)

=== watermelon 数据集 ===
分类准确率: 0.5000
混淆矩阵:
 [[0 2]
 [0 2]]
分类报告:
               precision    recall  f1-score   support

          坏瓜       0.00      0.00      0.00         2
          好瓜       0.50      1.00      0.67         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4



  prob += np.log(norm.pdf(row[col], loc=mean, scale=std))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
