In [6]:
# coding:utf-8
from collections import defaultdict
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets

%matplotlib inline
# Apply the "white" theme and the CJK-capable font list in ONE call.
# Calling set_style('white') separately afterwards re-applies the whole preset,
# including 'font.sans-serif', which would drop 'simhei' again.
sns.set_style('white', {'font.sans-serif': ['simhei', 'Arial']})

## 朴素贝叶斯
之所以称为“朴素”贝叶斯，是因为它假设在给定类别的条件下各特征相互独立（因此各特征的条件概率可以直接相乘）。

In [2]:
class NaiveBayes(object):
    """Naive Bayes classifier for discrete features with Laplace smoothing.

    Each feature is treated as a categorical variable. The score of a class is
    its (smoothed) prior multiplied by the (smoothed) conditional probability
    of every feature value given that class.

    Parameters
    ----------
    alpha: float, optional, default 1.0
        Laplace smoothing lambda.

    Attributes (set by ``fit``)
    ---------------------------
    class_count : pandas.Series
        Number of training samples per class, indexed by class label.
    y_class : pandas.Series
        Class labels, indexed by themselves (used to map labels to scores).
    p_y : pandas.Series
        Smoothed prior P(Y=c_k), indexed by class label.
    x_class : list of numpy.ndarray
        Distinct values observed for each feature column.
    p_x_y : dict
        ``p_x_y[class_label][feature_index][feature_value]`` is the smoothed
        conditional probability P(X_i = value | Y = class_label).
    """
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and per-feature conditional probabilities.

        Parameters
        ----------
        X : array-like, 2-D (one row per sample, one column per feature)
            Training samples; converted with ``np.array``.
        y : array-like, 1-D
            Class label of each row of ``X``.

        Returns
        -------
        self : NaiveBayes
            The fitted estimator, so calls can be chained.
        """
        self.X = np.array(X)
        self.y = np.array(y)

        # P(Y=c_k): smoothed prior probability of every class.
        self.class_count = pd.Series(self.y).value_counts()
        self.y_class = pd.Series(self.class_count.index.tolist(), index=self.class_count.index)
        self.p_y = (self.class_count + self.alpha) / (self.y.shape[0] + self.alpha * len(self.y_class))

        # P(X_i=a | Y=c_k): probability of a feature value given the class,
        # stored as p_x_y[class_label][feature_index][feature_value].
        self.x_class = [np.unique(x) for x in self.X.T]
        zeros_list = [{x_v: 0 for x_v in xi_class} for xi_class in self.x_class]
        self.p_x_y = defaultdict(lambda: copy.deepcopy(zeros_list))

        # Count the occurrences of every (class, feature, value) combination.
        # Iterate over the converted arrays, not the raw arguments, so plain
        # Python lists are accepted as well.
        for row, label in zip(self.X, self.y):
            for i, value in enumerate(row):
                self.p_x_y[label][i][value] += 1

        # Turn the counts into smoothed probabilities.
        for cls in self.class_count.index:
            cls_count = self.class_count.loc[cls]
            for i, p_x_dict in enumerate(self.p_x_y[cls]):
                x_cls_count = len(self.x_class[i])  # number of distinct values of feature i
                for p_x_value in p_x_dict:
                    p_x_dict[p_x_value] = (p_x_dict[p_x_value] + self.alpha) / (cls_count + self.alpha * x_cls_count)

        return self

    def _get_proba(self, xj):
        """Return the unnormalised score P(Y=c) * prod_i P(X_i=xj[i] | Y=c).

        Parameters
        ----------
        xj : sequence
            One sample, indexable by feature position.

        Returns
        -------
        pandas.Series
            One score per class, indexed by class label. A feature value that
            was never observed during ``fit`` contributes a factor of 0, so
            the score of every class becomes 0 for such a sample.
        """
        def likelihood(cls):
            return np.prod([p_x_dict.get(xj[i], 0) for i, p_x_dict in enumerate(self.p_x_y[cls])])

        return self.y_class.map(lambda cls: self.p_y.loc[cls] * likelihood(cls))

    def predict(self, X_pred):
        """Predict the class label of every sample in ``X_pred``.

        Parameters
        ----------
        X_pred : iterable of sequences
            Samples to classify, one sequence of feature values per sample.

        Returns
        -------
        numpy.ndarray
            The class label with the highest score for each sample. On ties
            (including the all-zero case) the first class in ``y_class`` wins.
        """
        # idxmax() returns the index *label* (the class); Series.argmax() would
        # return an integer position on current pandas versions.
        return np.array([self._get_proba(xj).idxmax() for xj in X_pred])


* 测试学习方法

In [3]:
# Toy training set: each row is one sample; the first two columns are the
# feature values and the last column is the class label (+1 / -1).
dataset = np.array([
    [1, 1, -1], [1, 2, -1], [1, 2, 1], [1, 1, 1], [1, 1, -1],
    [2, 1, -1], [2, 2, -1], [2, 2, 1], [2, 3, 1], [2, 3, 1],
    [3, 3, 1], [3, 2, 1], [3, 2, 1], [3, 3, 1], [3, 3, -1],
])
# Split into the feature matrix (all but the last column) and the label vector.
X_train, y_train = dataset[:, :-1], dataset[:, -1]

In [4]:
# Train on the toy data, then classify two samples. The first sample uses a
# first-feature value (11) that does not occur anywhere in the training set.
model = NaiveBayes()
model.fit(X=X_train, y=y_train)
model.predict(X_pred=[[11, 1], [2, 1]])

array([ 1, -1])