In [74]:
from collections import Counter

import numpy as np
import pandas as pd
from pandas import DataFrame,Series
from IPython.display import display

In [79]:
class information:
    #每一类样本数、总样本数、类别数、先验概率、特征统计信息
    def __init__(self,class_sample_num, sample_num, class_num, prior_probability, feature_probability):
        self.class_sample_num = class_sample_num
        self.sample_num = sample_num
        self.class_num = class_num
        self.prior_probability = prior_probability
        self.feature_probability = feature_probability

In [75]:
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec
#postingList = loadDataSet()[0]

In [76]:
#去除所有重复词，并且组成一个向量
def createFeature(dataset):
    temp = set()
    map(lambda s: temp.update(s), dataset)
    return list(temp)
#list_feature = createFeature(postingList)

#将数据集中的每个sample都表示成特征向量的形式，sample中的词对应特征向量的中为1的位置
def wordVector(dataset,list_feature): 
    len_dataset = len(dataset)
    len_features = len(list_feature)
    #初始化sample的特征向量
    vector_sample = np.zeros((len_dataset, len_features))                
    for i, sample in enumerate(dataset):
        for word in sample:
            index = list_feature.index(word)
            vector_sample[i][index] += 1
    return vector_sample        

#wordVector(postingList, list_feature)        

In [78]:
#统计每个类，每一个特征中，所有可能取值的个数返回{类别：{特征{取值:}}}
def statistics(X_data, y_data):
    y_data = y_data.iloc[:,0]
    info = information(dict, 0, 0, 0, dict())
     
    #存储每个特征不同取值的条件概率
    feature_probability = dict()
    #样本总数
    total_sample = y_data.shape[0]
    
    #一个Series，index是类别名，value是该类的样本数目
    class_info = y_data.value_counts()    
    #label的名字，也就是index
    labels = list(class_info.index)
    #数据集中出现的类别数
    labels_counts = class_info.shape[0]
    #计算先验概率,用拉普拉斯平滑一下,Series的数据格式
    prior_probability = 1.0 * (class_info + 1) / (total_sample + labels_counts)
    #存储每类样本数
    class_sample_num = dict()
    for label in labels:
        label_x_data = X_data[y_data == label]         
        class_sample_num[label] = label_x_data.shape[0]                
        feature_probability[label] = dict()
        for col in label_x_data:
            feature_probability[label][col] = {}
            #每个特征的可能取值
            feature_probability[label][col].update(dict(label_x_data[col].value_counts()))
    #存储每类样本数
    info.class_sample_num = class_sample_num
    #存储样本总数
    info.sample_num = total_sample
    #存储类别数
    info.class_num = labels_counts
    #存数先验概率
    info.prior_probability = prior_probability
    #存储特征的统计学习
    info.feature_probability = feature_probability
    
    return info
#statistics(X_data, y_data)    

In [99]:
'''计算给定样本的朴素贝叶斯分类器的值，
之后只要通过查表（probability）就能立马得出每一类的概率，
取最大概率的类标签为要预测或分类sample的类'''
def calcProbability(X_data, y_data):    
    info = statistics(X_data, y_data)
    feature_info = info.feature_probability
    probability = {}
    for label, value in feature_info.items():
        df = DataFrame(value)        
        temp_df = df.copy()
        df.fillna(0,inplace =True)
        #每类样本数
        class_sample_num = info.class_sample_num[label]  
        for col in df:
            #每个特征的可能取值数
            feature_value_unique_num = len(temp_df[col].dropna())
            df[col] = (df[col] + 1) / (class_sample_num + feature_value_unique_num)
            probability[label] = df  
        
    return probability                 
calcProbability(X_data.iloc[0], X_data, y_data)

{0:       0    1    2     3     4    5    6    7     8     9   ...    22    23  \
 0.0  0.6  0.6  0.6  1.00  1.00  0.6  0.6  0.6  1.00  1.00  ...   0.4  1.00   
 1.0  0.4  0.4  0.4  0.25  0.25  0.4  0.4  0.4  0.25  0.25  ...   0.6  0.25   
 
       24    25   26    27   28   29   30    31  
 0.0  0.6  1.00  0.6  1.00  0.6  0.6  0.6  0.25  
 1.0  0.4  0.25  0.4  0.25  0.4  0.4  0.4  1.00  
 
 [2 rows x 32 columns],
 1:        0     1     2    3    4     5    6     7    8    9   ...    22   23  \
 0.0  1.00  1.00  1.00  0.6  0.6  1.00  0.6  1.00  0.6  0.4  ...   0.6  0.6   
 1.0  0.25  0.25  0.25  0.4  0.4  0.25  0.4  0.25  0.4  0.6  ...   0.4  0.4   
 
        24   25   26    27    28    29    30    31  
 0.0  1.00  0.6  0.4  0.25  1.00  1.00  1.00  1.00  
 1.0  0.25  0.4  0.6  1.00  0.25  0.25  0.25  0.25  
 
 [2 rows x 32 columns]}

In [77]:
dataset = loadDataSet()
postingList = dataset[0]
class_vector = dataset[1]
list_feature = createFeature(postingList)
arr_data = wordVector(postingList, list_feature)  
#整理好的数据集和标签
X_data = DataFrame(arr_data)
y_data = DataFrame(class_vector)
#包含X_data,y_data的合并
data = pd.merge(X_data,y_data,left_index=True,right_index=True,how='outer')
display(X_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
