## 基本方法

训练数据集：

    X : feature vectors
    Y : labels
    
基本思想 maximum posterior:

    P(y=c_k | X=x) = (P(X=x|Y=c_k)P(Y=c_k))/(sum(P(X=x|Y=c_k)P(Y=c_k)))

y = f(x) = argmax P(y=c_k | X=x)

#### strong assumption

条件独立性假设：
    
    P(X=x|Y=c_k) = P(X_1 = x_1, X_2 = x_2, .... | Y=c_k)
 
        = P(X_1 = x_1 | Y=c_k) * P(X_2 = x_2 | Y=c_k) * ...

后验概率最大化： 风险期望最小化 R(f) = E[L(Y, f(X))]

## 参数估计

方法：最大近似然估计

    P(Y=c_k) = sum( I(y_i = c_k) )/N
    P(X_n = x_n | Y=c_k) = (sum(I(y_i(n) = x_n, y_i = c_k)))/(sum(I(y_i = c_k)))  第n个特征为x_n,分类为c_k; i为数据序号
    

In [1]:
import numpy as np
class naive_bayes:
    
    def __init__(self, X, Y, laplace_smoothing=1):
        self.X = X
        self.Y = Y
        self.laplace_smoothing = laplace_smoothing
        
        X_shape = X.shape
        
        # prior prob
        self.classes, counts = np.unique(Y, return_counts=True)
        self.prior = dict()
        self.cond_prob = dict()
        occurrence = dict(zip(self.classes, counts))
        for i in self.classes:
            self.prior[i] = self.__estimate_prior(i, occurrence, X_shape[0])
            
            # X instances of class Y = i
            index_class = np.where(Y == i)
            X_subclass = X[index_class]
            
            # cond prob
            self.cond_prob[i] = self.__estimate_conditional_prob(X_subclass)
        
        print (self.cond_prob)
        print (self.prior)
        
    def __estimate_prior(self, C, number_occurrence, total_instance):
        return (number_occurrence[C]+laplace_smoothing)/(total_instance+len(C)*laplace_smoothing)
        
    def __estimate_conditional_prob(self, X):
        cond_prob = dict()
        X_shape = X.shape
        for n in range(X_shape[1]):
            cond_prob[n] = dict()
            # i eme features of X
            features, counts = np.unique(X[:, n], return_counts=True)
            occurrence = dict(zip(features, counts))
            
            for k in features:
                cond_prob[n][k] = (occurrence[k]+laplace_smoothing)/(X_shape[0]+len(k)*laplace_smoothing) 
        
        return cond_prob
    
    def inference(self, x):
        prob = dict()
        for i in self.classes:
            prob[i] = self.__estimate(x, i)
            
        return self.__argmax(prob)
    
    def __estimate(self, x, c):
        prob = self.prior[c]
        for i in range(x.shape[0]):
            prob *= self.cond_prob[c][i][x[i]]
        return prob
    
    def __argmax(self, prob):
        c = None
        for i in prob.keys():
            c_new = prob[i]
            if c == None or c_new > c:
                c = c_new
                max_item = i
                
        return max_item, c

In [3]:

X = np.array([[1, "S"], [1, "M"], [1, "M"], [1, "S"], [1, "S"], [2, "S"], [2, "M"], [2, "M"], [2, "L"], [2, "L"], [3, "L"], [3, "M"], [3, "M"], [3, "L"], [3, "L"]])
Y = np.array([-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1])

classifier = naive_bayes(X, Y)

NameError: name 'laplace_smoothing' is not defined

In [23]:
print (classifier.inference(np.array([2, "S"])))

(-1, 0.066666666666666666)
