# 朴素贝叶斯法

In [69]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math

## Load Data

In [70]:
# Load iris into a DataFrame with short feature names plus the class label.
iris = load_iris()
df = pd.DataFrame(
    iris.data,
    columns=['sepal length', 'sepal width', 'petal length', 'petal width'],
)
df['label'] = iris.target

# Keep only the first 100 rows so the task is binary
# (presumably the first two iris classes, 50 samples each -- see load_iris docs).
data = np.array(df.iloc[:100, :])
X, y = data[:, :4], data[:, -1]

# Fix the seed so that Restart & Run All reproduces the same split and scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Gaussian Naive Bayes

假设每个特征在给定类别条件下的似然（条件概率）服从高斯分布。

概率密度函数：
$$P(x_i | y_k)=\frac{1}{\sqrt{2\pi\sigma^2_{y_k}}}\exp\left(-\frac{(x_i-\mu_{y_k})^2}{2\sigma^2_{y_k}}\right)$$

数学期望(mean)：$\mu$

方差：$\sigma^2=\frac{\sum(X-\mu)^2}{N}$

In [71]:
class NaiveBayes:
    """Gaussian naive Bayes classifier for continuous features.

    Each feature is modelled, per class, as an independent normal
    distribution whose mean and (population) standard deviation are
    estimated from the training data.

    After ``train`` the fitted parameters live in ``self.model`` as
    ``{label: [[(mean, std), ...one pair per feature...], prior]}``.

    Args:
        var_smoothing: Fraction of the largest feature variance used as a
            lower bound for every per-class variance. It keeps the density
            finite when a feature is constant within a class.
    """

    # Absolute lower bound for a standard deviation, used before training
    # and when every training feature has zero variance.
    _ABS_MIN_STD = 1e-12

    def __init__(self, var_smoothing=1e-9):
        self.model = None
        self.var_smoothing = var_smoothing
        self._min_std = self._ABS_MIN_STD

    def gaussian_probability(self, x, mean, std):
        """Return the normal density N(x; mean, std**2).

        ``std`` is clipped from below so that a zero standard deviation
        yields a large finite density instead of NaN.
        """
        variance = np.maximum(std, self._min_std) ** 2
        coefficient = 1 / np.sqrt(2 * math.pi * variance)
        exponent = np.exp(-((x - mean) ** 2) / (2 * variance))
        return coefficient * exponent

    def _log_gaussian(self, x, mean, std):
        """Return log N(x; mean, std**2), with the same clipping of ``std``."""
        variance = np.maximum(std, self._min_std) ** 2
        return -0.5 * np.log(2 * math.pi * variance) - ((x - mean) ** 2) / (2 * variance)

    def data_probability(self, train_data):
        """Return ``[(mean, std), ...]`` with one pair per feature column.

        ``train_data`` is a sequence of samples; ``np.std`` is used with its
        default settings, i.e. the population standard deviation.
        """
        return [(np.mean(column), np.std(column)) for column in zip(*train_data)]

    def train(self, X_train, y_train):
        """Estimate per-class feature statistics and class priors.

        Args:
            X_train: Sequence of samples, each a sequence of numeric features.
            y_train: Sequence of hashable class labels, one per sample.

        Raises:
            ValueError: If ``X_train`` is empty or its length differs from
                ``y_train`` (``zip`` would otherwise drop samples silently
                and the priors would no longer sum to one).
        """
        n_samples = len(X_train)
        if n_samples == 0:
            raise ValueError('X_train must contain at least one sample')
        if n_samples != len(y_train):
            raise ValueError('X_train and y_train must have the same length')

        # Variance floor relative to the widest feature, so a feature that
        # is constant within a class cannot cause a division by zero.
        variances = np.var(np.asarray(X_train, dtype=float), axis=0)
        max_variance = float(variances.max()) if variances.size else 0.0
        self._min_std = max(
            math.sqrt(self.var_smoothing * max_variance), self._ABS_MIN_STD
        )

        samples_by_label = {target: [] for target in set(y_train)}
        for features, target in zip(X_train, y_train):
            samples_by_label[target].append(features)

        self.model = {
            target: [self.data_probability(rows), len(rows) / n_samples]
            for target, rows in samples_by_label.items()
        }
        print('Naive Bays: Train Completed!')

    def predict(self, x):
        """Return the label with the highest posterior for one sample ``x``.

        Scores are accumulated as log-prior plus summed log-densities.
        Multiplying raw densities underflows to 0.0 for every class once
        there are many features, which makes the argmax meaningless.
        """
        log_posterior = {}
        for target, (summaries, prior) in self.model.items():
            total = math.log(prior)
            for i, (mean, std) in enumerate(summaries):
                total += self._log_gaussian(x[i], mean, std)
            log_posterior[target] = total
        return max(log_posterior, key=log_posterior.get)

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` predicted correctly.

        Raises:
            ValueError: If ``X_test`` and ``y_test`` differ in length.
            ZeroDivisionError: If the test set is empty.
        """
        if len(X_test) != len(y_test):
            raise ValueError('X_test and y_test must have the same length')
        accurate_count = sum(
            1
            for features, target in zip(X_test, y_test)
            if target == self.predict(features)
        )
        return accurate_count / len(y_test)

In [72]:
# Fit the hand-written classifier on the training split, then display its
# accuracy on the held-out split as the cell output.
model = NaiveBayes()
model.train(X_train, y_train)
model.score(X_test, y_test)

Naive Bays: Train Completed!


1.0

## sklearn

In [73]:
from sklearn.naive_bayes import GaussianNB

# Reference implementation: fit scikit-learn's Gaussian naive Bayes on the
# same split (fit returns the estimator itself) and display its accuracy.
clf = GaussianNB().fit(X_train, y_train)
clf.score(X_test, y_test)

1.0

In [74]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB # 伯努利模型和多项式模型

参考代码: https://github.com/fengdu78/lihang-code

python: 3.7.6