# 朴素贝叶斯 

## 训练
- 采用积极学习
- 训练过程实际上就是构建一个三级字典（类别 → 属性 → 统计量），存储计算先验概率与似然所需的计数（离散属性）或均值与标准差（连续属性）

In [1]:
import pandas as pd
import numpy as np

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer, load_iris, load_wine
from sklearn.metrics import accuracy_score

In [3]:
from __future__ import division
from sklearn.preprocessing import LabelEncoder

In [31]:
class NaiveBayes(object):
    """Naive Bayes classifier supporting discrete and continuous features.

    Discrete features are label-encoded and modelled with smoothed category
    frequencies; continuous features are modelled with a per-class normal
    distribution (mean / standard deviation).

    Training builds a nested dictionary ``x_labels_count[class][feature]``
    holding either ``{value: count, ..., 'sum': n, 'labels': k}`` for a
    discrete feature or ``{'mean': mu, 'std': sigma}`` for a continuous one,
    plus ``y_labels_count`` with the same discrete layout for the classes.
    """

    # Lower bound applied to a standard deviation before it is used as a
    # divisor, so a feature that is constant within a class does not cause
    # a division by zero.
    _MIN_STD = 1e-9

    def __init__(self, alpha=1):
        """Create an untrained model.

        :param alpha: additive smoothing coefficient; 1 is Laplace
            smoothing, 0 disables smoothing, values in between give
            weaker smoothing.
        """
        # One encoder per feature column (None for continuous columns), so
        # the distinct values of each discrete feature map to 0..k-1.
        self.xEncoders = []
        self.yEncoder = None
        self.is_dispersed = None
        self.alpha = alpha
        # Dictionaries holding the statistics for the prior (classes) and
        # the likelihood (features per class).
        self.y_labels_count = None
        self.x_labels_count = None

    def __statistic(self, data, is_dispersed=True, n_labels=None):
        """Summarise one column of (encoded) data.

        :param data: 1-D array; non-negative integer codes when discrete.
        :param is_dispersed: truthy if the column is discrete.
        :param n_labels: optional total number of categories of the
            feature; when omitted, the number of distinct values present
            in ``data`` is stored instead.
        :return: ``{value: count, ..., 'sum': n, 'labels': k}`` for a
            discrete column, ``{'mean': mu, 'std': sigma}`` otherwise.
        """
        res = {}
        if is_dispersed:  # discrete data: count every value that occurs
            codes = np.asarray(data).astype(int)
            labels_count = np.bincount(codes)
            labels = np.nonzero(labels_count)[0]
            for label in labels:
                res[int(label)] = int(labels_count[label])
            res['sum'] = int(codes.shape[0])
            res['labels'] = len(labels) if n_labels is None else int(n_labels)
        else:  # continuous data: keep the normal-distribution parameters
            values = np.asarray(data, dtype=float)
            res['mean'] = float(np.mean(values))
            res['std'] = float(np.std(values))
        return res

    def _label_keys(self, labels_count_dic):
        """Return the sorted value keys of a discrete statistics dict.

        The bookkeeping entries ('sum', 'labels') are the only string keys,
        so they are filtered by type instead of by position in the dict.
        """
        return sorted(key for key in labels_count_dic
                      if not isinstance(key, str))

    def train(self, train_x, train_y, is_dispersed=None):
        """Fit the model.

        :param train_x: 2-D array-like of shape (samples, features); it is
            copied, the caller's data is left untouched.
        :param train_y: 1-D array-like of class labels.
        :param is_dispersed: one flag per feature, truthy when the feature
            is discrete. ``None`` treats every feature as discrete.
        """
        train_x = np.array(train_x)  # copy: encoding below writes in place
        train_y = np.asarray(train_y)
        m, n = train_x.shape
        if is_dispersed is None:
            is_dispersed = [True] * n
        # Normalise to plain bools so numpy booleans behave like True/False.
        self.is_dispersed = [bool(flag) for flag in is_dispersed]

        # Start from a clean list so that re-training does not keep the
        # encoders of a previous fit.
        self.xEncoders = []
        for i in range(n):
            if self.is_dispersed[i]:
                cur_le = LabelEncoder()
                cur_le.fit(train_x[:, i])
                train_x[:, i] = cur_le.transform(train_x[:, i])
                self.xEncoders.append(cur_le)
            else:
                self.xEncoders.append(None)
        cur_le = LabelEncoder()
        cur_le.fit(train_y)
        train_y = cur_le.transform(train_y)
        self.yEncoder = cur_le

        # Occurrence count of every class label.
        y_labels_count = self.__statistic(train_y, is_dispersed=True)
        x_labels_count = {}
        for key in self._label_keys(y_labels_count):
            x_labels_count[key] = {}
            cur_train_x = train_x[train_y == key]
            for i in range(n):
                encoder = self.xEncoders[i]
                # Smoothing must use the number of categories of the whole
                # feature, not only those observed within this class.
                n_labels = None if encoder is None else len(encoder.classes_)
                x_labels_count[key][i] = self.__statistic(
                    cur_train_x[:, i], self.is_dispersed[i], n_labels)
        self.y_labels_count = y_labels_count
        self.x_labels_count = x_labels_count

    def fit(self, train_x, train_y, is_dispersed=None):
        """Alias of :meth:`train`."""
        self.train(train_x, train_y, is_dispersed)

    def _log_probability(self, labels_count_dic, labels_value,
                         is_dispersed=True):
        """Return the natural log of the probability (or density) of a value.

        Discrete: ``(count + alpha) / (sum + labels * alpha)``.
        Continuous: normal density with the stored mean and std.
        Returns ``-inf`` when the discrete probability is zero.
        """
        if is_dispersed:
            count = labels_count_dic.get(labels_value, 0)
            numerator = float(count) + self.alpha
            denominator = (float(labels_count_dic['sum']) +
                           labels_count_dic['labels'] * self.alpha)
            if numerator <= 0 or denominator <= 0:
                return float('-inf')
            return float(np.log(numerator) - np.log(denominator))
        cur_mean = float(labels_count_dic['mean'])
        cur_std = max(float(labels_count_dic['std']), self._MIN_STD)
        z = (float(labels_value) - cur_mean) / cur_std
        return float(-0.5 * np.log(2 * np.pi) - np.log(cur_std) - 0.5 * z * z)

    def __calProbability(self, labels_count_dic, labels_value,
                         is_dispersed=True):
        """Return the probability (discrete) or density (continuous) of a value."""
        return float(np.exp(self._log_probability(
            labels_count_dic, labels_value, is_dispersed)))

    def _predict_encoded(self, x):
        """Return the encoded class label of every row of an encoded ``x``.

        Scores are sums of log-probabilities, which ranks classes like the
        product of probabilities but does not underflow to zero.
        """
        m, n = x.shape
        class_labels = self._label_keys(self.y_labels_count)
        log_priors = [self._log_probability(self.y_labels_count, y_label)
                      for y_label in class_labels]
        pre_y = np.zeros(m, dtype=int)
        for i in range(m):
            best_label = None
            best_score = float('-inf')
            for y_label, log_prior in zip(class_labels, log_priors):
                score = log_prior
                for j in range(n):
                    score += self._log_probability(
                        self.x_labels_count[y_label][j], x[i][j],
                        is_dispersed=self.is_dispersed[j])
                # Strict comparison keeps the first class on ties; the
                # None check still yields a label when every score is -inf.
                if best_label is None or score > best_score:
                    best_score = score
                    best_label = y_label
            pre_y[i] = best_label
        return pre_y

    def predict(self, x):
        """Predict the class label of every row of ``x``.

        :param x: 2-D array-like with the same columns as the training data;
            it is copied before the discrete columns are encoded.
        :return: array of labels expressed in the original label values.
        """
        x = np.array(x)  # copy: encoding below writes in place
        m, n = x.shape
        for i in range(n):
            if self.xEncoders[i] is not None:
                x[:, i] = self.xEncoders[i].transform(x[:, i])
        # Map the encoded predictions back to the caller's label values.
        return self.yEncoder.inverse_transform(self._predict_encoded(x))

In [26]:
def load_data(path='../data/heart.csv', test_size=0.2, random_state=None):
    """Load a CSV data set and split it into training and test parts.

    The last column of the file is used as the label, all other columns
    as features.

    :param path: location of the CSV file.
    :param test_size: passed to ``train_test_split``; 0.2 by default.
    :param random_state: seed passed to ``train_test_split``; set it to get
        the same split on every run. ``None`` keeps the split random.
    :return: ``(train_x, train_y, test_x, test_y)``.
    """
    heart_data = pd.read_csv(path).values
    y = heart_data[:, -1]
    x = heart_data[:, :-1]
    train_x, test_x, train_y, test_y = train_test_split(
        x, y, test_size=test_size, random_state=random_state)
    assert train_x.shape[1] == test_x.shape[1]
    return train_x, train_y, test_x, test_y

In [27]:
train_x, train_y, test_x, test_y = load_data()

In [28]:
train_x[:3]

array([[  4.10000000e+01,   0.00000000e+00,   1.00000000e+00,
          1.26000000e+02,   3.06000000e+02,   0.00000000e+00,
          1.00000000e+00,   1.63000000e+02,   0.00000000e+00,
          0.00000000e+00,   2.00000000e+00,   0.00000000e+00,
          2.00000000e+00],
       [  4.80000000e+01,   1.00000000e+00,   1.00000000e+00,
          1.30000000e+02,   2.45000000e+02,   0.00000000e+00,
          0.00000000e+00,   1.80000000e+02,   0.00000000e+00,
          2.00000000e-01,   1.00000000e+00,   0.00000000e+00,
          2.00000000e+00],
       [  5.40000000e+01,   1.00000000e+00,   1.00000000e+00,
          1.08000000e+02,   3.09000000e+02,   0.00000000e+00,
          1.00000000e+00,   1.56000000e+02,   0.00000000e+00,
          0.00000000e+00,   2.00000000e+00,   0.00000000e+00,
          3.00000000e+00]])

In [32]:
def naivebayesPre(train_x, train_y, test_x, test_y, model, is_dispersed=None):
    """Fit ``model`` on the training split and print its test accuracy.

    :param train_x: training features.
    :param train_y: training labels.
    :param test_x: test features.
    :param test_y: test labels, compared against the predictions.
    :param model: object exposing ``fit(x, y, is_dispersed)`` and
        ``predict(x)``.
    :param is_dispersed: one flag per feature column, True for a discrete
        column. ``None`` uses the 13-entry mask this notebook applies to
        the data returned by ``load_data``.
    """
    if is_dispersed is None:
        is_dispersed = [False, True, False, False, False, True, False, False,
                        True, False, False, False, True]
    model.fit(train_x, train_y, is_dispersed)
    pre_y = model.predict(test_x)
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print('accuracy: %f' % (accuracy_score(test_y, pre_y)))

In [33]:
naivebayesPre(train_x, train_y, test_x, test_y, NaiveBayes())

accuracy: 0.852459
