In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math


In [None]:
'''
实现中的问题：
1. 重点在于理解朴素贝叶斯法训练时需要估计的两类量：先验概率 P(Y=c) 和类条件概率 P(X=x|Y=c)。
2. 这里实现的是高斯朴素贝叶斯，即假设在类别确定的情况下，每个特征的取值服从高斯分布（伯努利朴素贝叶斯、多项式朴素贝叶斯同理，只是所假设的分布不同）。
因此只需要估计该类别下每个特征的期望和标准差，就可以求出某个实例 x 在该类别下的类条件概率密度。

'''


In [15]:

# Load the iris data set. All four features are used here, unlike the
# perceptron and kNN notebooks.
data=load_iris()
df=pd.DataFrame(data.data,columns=['sl','sw','pl','pw'])
df['label']=data.target

# X,y: keep only the first 100 rows and split features from the label column.
X=np.array(df.iloc[:100,:-1])
y=np.array(df.iloc[:100,-1])

# Fix the seed so that Restart & Run All reproduces the same 70/30 split
# (and therefore the same accuracy figures printed further down).
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [24]:
class GaussianNaiveBayes():
    """Gaussian naive Bayes classifier built on the standard library.

    For every class the model stores one (mean, standard deviation) pair per
    feature plus the class prior. A sample is scored per class as
    ``prior * product of per-feature Gaussian densities`` and assigned to the
    class with the highest score.
    """

    # Floor applied to every fitted standard deviation. A feature that is
    # constant within a class has a standard deviation of 0, which would make
    # gaussian_prob divide by zero.
    _MIN_STDEV=1e-9

    def __init__(self):
        # label -> [(mean, stdev), ...], one pair per feature
        self.model={}
        # label -> prior probability of that label in the training data
        self.prior={}

    def gaussian_prob(self,x,u,alpha):
        """Return the Gaussian density at ``x`` for mean ``u`` and standard deviation ``alpha``.

        Raises:
            ZeroDivisionError: if ``alpha`` is 0.
        """
        exponent=math.exp(-(math.pow(x-u,2))/(2*math.pow(alpha,2)))
        return (1/(alpha*math.sqrt(2*math.pi))*exponent)

    def mean(self,X):
        """Return the arithmetic mean of the values in ``X``."""
        return sum(X)/float(len(X))

    def stdev(self,X):
        """Return the population standard deviation (divides by ``len(X)``) of ``X``."""
        avg=self.mean(X)
        return math.sqrt(sum([math.pow(i-avg,2) for i in X])/float(len(X)))

    def fit(self,X_train,y_train):
        """Estimate per-class feature statistics and class priors.

        Any previously fitted state is discarded, so the instance can be
        refitted on a different label set.

        Args:
            X_train: sequence of samples, each a sequence of numeric features.
            y_train: sequence of hashable class labels, one per sample.

        Raises:
            ValueError: if ``X_train`` and ``y_train`` differ in length.
        """
        if len(X_train)!=len(y_train):
            raise ValueError(
                'X_train and y_train must have the same length, got {} and {}'.format(
                    len(X_train),len(y_train)))

        # Group the samples by their class label.
        grouped={}
        for row,label in zip(X_train,y_train):
            grouped.setdefault(label,[]).append(row)

        # zip(*rows) transposes the samples into per-feature columns.
        self.model={
            label:[(self.mean(col),max(self.stdev(col),self._MIN_STDEV))
                   for col in zip(*rows)]
            for label,rows in grouped.items()
        }

        # Prior = relative frequency of each label in the training data.
        total=float(len(y_train))
        self.prior={label:count/total for label,count in Counter(y_train).items()}

    def cal_prob(self,xp):
        """Return ``{label: prior * product of feature densities}`` for sample ``xp``.

        The values are unnormalised, so they do not sum to 1.
        """
        all_prob={}
        for label,prior in self.prior.items():
            score=prior
            for i,(mean,stdev) in enumerate(self.model[label]):
                score*=self.gaussian_prob(xp[i],mean,stdev)
            all_prob[label]=score
        return all_prob

    def predict(self,xp):
        """Return the label with the highest score for sample ``xp``.

        Ties go to the label that appears first in ``self.prior``.

        Raises:
            ValueError: if the model has not been fitted on any data.
        """
        all_prob=self.cal_prob(xp)
        if not all_prob:
            raise ValueError('model has not been fitted on any data')
        return max(all_prob,key=all_prob.get)

    def score(self,X_test,y_test):
        """Return the fraction of samples in ``X_test`` whose prediction equals ``y_test``.

        Raises:
            ZeroDivisionError: if ``X_test`` is empty.
        """
        acc_ct=0
        for i in range(len(X_test)):
            if self.predict(X_test[i])==y_test[i]:
                acc_ct+=1
        return acc_ct/float(len(X_test))
                

In [25]:
# Train the hand-written classifier, classify one sample and report the
# accuracy on the held-out split.
nb=GaussianNaiveBayes()
nb.fit(X_train,y_train)
sample_label=nb.predict([4.4,  3.2,  1.3,  0.2])
print('test point:{}'.format(sample_label))
test_accuracy=nb.score(X_test,y_test)
print('test acc score:{}'.format(test_accuracy))

test point:0
test acc score:1.0


In [26]:
# Cross-check: run scikit-learn's GaussianNB on the same split and sample.
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()
nb.fit(X_train,y_train)
sk_sample_label=nb.predict([[4.4,  3.2,  1.3,  0.2]])
print('sklearn test point:{}'.format(sk_sample_label))
sk_accuracy=nb.score(X_test,y_test)
print('sklearn acc score:{}'.format(sk_accuracy))


sklearn test point:[0]
sklearn acc score:1.0
