In [1]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [2]:
# Load the labelled training table: the 'label' column is the target,
# every other column is used as a feature.
train = pd.read_csv('../dataset/train.csv')
X, Y = train.drop('label', axis=1).to_numpy(), train['label'].to_numpy()
# Hold out 30% of the labelled rows for validation. The fixed seed makes the
# split (and every accuracy computed from it) identical on each fresh run.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.3, random_state=42)
# The test table is converted whole; no column is dropped.
test = pd.read_csv('../dataset/test.csv')
X_test = test.to_numpy()

给定数据集$$T=\{(x_1,y_1), (x_2,y_2),\cdots,(x_N,y_N)\}$$
其中$x_i \in \mathbb{R}^n$，$y_i \in \{c_1, c_2, \cdots, c_K\}$

朴素贝叶斯法通过对训练集学习，将实例分到后验概率最大的类中$$y=\mathop{\arg\max}\limits_{c_k}P(Y=c_k|X=x)$$
根据条件概率公式以及全概率公式（即贝叶斯定理）可得$$\mathop{\arg\max}\limits_{c_k} P(Y=c_k|X=x)=\mathop{\arg\max}\limits_{c_k} \frac{P(X=x|Y=c_k)P(Y=c_k)}{\sum\limits_{k'}P(X=x|Y=c_{k'})P(Y=c_{k'})}$$
分母对每一类都是相同的，所以可以将分母省略，朴素贝叶斯法通过对条件概率进行条件独立性假设得名，$$P(X=x|Y=c_k)=\prod_{j=1}^n P(X^{(j)}=x^{(j)}|Y=c_k)$$
在具体实现中，为防止多个很小的概率连乘造成浮点数下溢，通常将概率进行对数处理，故$$y=\mathop{\arg\max}\limits_{c_k} \left[\log P(Y=c_k) + \sum\limits_j \log P(X^{(j)}=x^{(j)}|Y=c_k)\right]$$

使用极大似然估计对先验概率以及条件概率进行学习，本例中采用多项式分布，则$$P(Y=c_k)=\frac{\sum\limits_{i=1}^N I(y_i=c_k)}{N}$$
在学习条件概率时，为避免因训练集无法完全反映真实的数据分布而出现所要估计的概率为0的情况，通常对概率进行平滑，设第$j$个特征的取值集合为$\{a_{j1}, \cdots, a_{jS_j}\}$，则$$P(X^{(j)}=a_{jl}|Y=c_k)=\frac{\sum\limits_{i=1}^N I(x_i^{(j)}=a_{jl}, y_i=c_k)+\lambda}{\sum\limits_{i=1}^N I(y_i=c_k)+S_j\lambda}$$
其中$\lambda \ge 0$，取$\lambda=1$时即为Laplace平滑（本例中$\lambda=1$，$S_j=256$）。

In [3]:
class NaiveBayes:
    """Naive Bayes classifier for discrete features with additive smoothing.

    Every feature is treated as a categorical variable whose possible values
    are the integers ``0 .. n_values - 1``. Class priors are estimated by
    relative frequency; per-feature conditional probabilities are estimated
    by smoothed relative frequency. All probabilities are stored as logs.

    Parameters
    ----------
    n_values : int, default 256
        Number of distinct values each feature can take.
    alpha : float, default 1
        Additive smoothing strength (1 is Laplace smoothing). With
        ``alpha=0`` unseen values get a log-probability of ``-inf``.

    Attributes
    ----------
    model : dict
        Filled by ``fit``. Maps each class label to a dict with keys
        ``'feature_log_prior'`` (a list holding, per feature, a dict
        ``value -> log P(value | label)``) and ``'class_log_prior'``
        (``log P(label)``).
    """

    def __init__(self, n_values=256, alpha=1):
        self.n_values = n_values
        self.alpha = alpha
        self.model = {}

    def count(self, x):
        """Return the smoothed log-probability of every possible feature value.

        Parameters
        ----------
        x : array-like
            Observed values of one feature for the samples of one class.

        Returns
        -------
        dict
            Maps each value ``v`` in ``range(n_values)`` to
            ``log(count(v) + alpha) - log(len(x) + alpha * n_values)``.
        """
        x = np.asarray(x)
        # The normaliser is the same for every value, so compute it once.
        log_total = np.log(len(x) + self.alpha * self.n_values)
        return {
            feature: np.log(np.count_nonzero(x == feature) + self.alpha) - log_total
            for feature in range(self.n_values)
        }

    def fit(self, X, Y):
        """Estimate class priors and per-feature conditional distributions.

        Any previously fitted parameters are discarded, so classes seen only
        in an earlier call cannot leak into later predictions.

        Parameters
        ----------
        X : array-like, two-dimensional (samples x features)
            Feature values, expected to lie in ``range(n_values)``.
        Y : array-like, one entry per row of ``X``
            Class labels.

        Raises
        ------
        ValueError
            If ``X`` and ``Y`` do not contain the same number of samples.
        """
        X, Y = np.asarray(X), np.asarray(Y)
        if len(X) != len(Y):
            raise ValueError(
                'X and Y must have the same number of samples, got %d and %d'
                % (len(X), len(Y)))
        dim = X.shape[1]
        n_samples = len(Y)

        # Build into a fresh dict and swap it in at the end, so a refit never
        # keeps stale labels and a failure leaves the old model untouched.
        model = {}
        for label in np.unique(Y):
            mask = Y == label
            # Select this class's rows once; columns are then cheap slices
            # instead of a Python-level pass over every sample per feature.
            X_label = X[mask]
            model[label] = {
                'feature_log_prior': [self.count(X_label[:, i]) for i in range(dim)],
                'class_log_prior': np.log(np.count_nonzero(mask)) - np.log(n_samples)
            }
        self.model = model

    def predict_one(self, x):
        """Return the label with the highest log-posterior score for one sample.

        Parameters
        ----------
        x : sequence
            Feature values of a single sample, one per fitted feature.

        Returns
        -------
        The best-scoring label; the first such label (in fitted order) wins
        ties. ``None`` if the model has not been fitted.

        Raises
        ------
        KeyError
            If a feature value is not in ``range(n_values)``.
        """
        pred, pred_prob = None, -np.inf

        for label, params in self.model.items():
            prob = params['class_log_prior']
            feature_log_prior = params['feature_log_prior']

            for i in range(len(x)):
                prob += feature_log_prior[i][x[i]]

            # `pred is None` marks "no candidate yet" explicitly rather than
            # overloading a score of 0, which is a legal log-score.
            if pred is None or prob > pred_prob:
                pred, pred_prob = label, prob

        return pred

    def predict(self, X):
        """Return a list with the predicted label for every sample in ``X``."""
        return [self.predict_one(x) for x in X]

In [4]:
# Train the from-scratch NaiveBayes classifier on the training split.
model = NaiveBayes()
model.fit(X_train, Y_train)

In [5]:
# Predict labels for the held-out validation split and display the accuracy.
Y_pred = model.predict(X_val)
accuracy_score(Y_val, Y_pred)

0.8217460317460318

# sklearn

In [6]:
# Baseline for comparison: scikit-learn's MultinomialNB fitted on the same training split.
# NOTE: `model` is rebound here, so the from-scratch classifier is no longer
# reachable under that name after this cell runs.
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
# Validation accuracy of the scikit-learn model, measured on the same
# validation split as the from-scratch classifier.
Y_pred = model.predict(X_val)
accuracy_score(Y_val, Y_pred)

0.8311111111111111