## bayes_classfier
每个元组都被表示为n维 属性 向量x = ( x 1 , x 1 , . . . , x n )
一共有k个类别: c1, c2, c3, c4

P(c|X) = (P(X|c)*P(c)) / P(X)

reference: https://blog.csdn.net/qq_25948717/article/details/81744277

In [1]:
import random
import operator

In [2]:
datasets = {
    'banana': {'long': 400, 'not_long': 100, 'sweet': 350, 'not_sweet': 150, 'yellow': 450, 'not_yellow': 50},
    'orange': {'long': 0, 'not_long': 300, 'sweet': 150, 'not_sweet': 150, 'yellow': 300, 'not_yellow': 0},
    'other_fruit': {'long': 100, 'not_long': 100, 'sweet': 150, 'not_sweet': 50, 'yellow': 50, 'not_yellow': 150}
}

### 计算各种水果的总数

In [3]:
def count_total_fruit(data):
    """
    return {'banana': 500, 'orange': 300, 'other_fruit': 200}, 1000
    """
    count = {}
    total = 0
    for fruit in data:
        # fruit : banana orange other_fruit, 利用数据的特殊性，求每个类别的样本数
        count[fruit] = data[fruit]['sweet'] + data[fruit]['not_sweet']
        total += count[fruit]

    return count, total

In [4]:
count, total = count_total_fruit(datasets)
count["banana"], count, total

(500, {'banana': 500, 'orange': 300, 'other_fruit': 200}, 1000)

In [5]:
# 获取字典类型的key
for k, v in count.items():
    print(k, v)

labels = []
for label in count:
    labels.append(label)
labels

banana 500
orange 300
other_fruit 200


['banana', 'orange', 'other_fruit']

### 计算先验概率 P(C)

In [6]:
def cal_priori_probs(data):
    categories, total = count_total_fruit(data)
    prior_probs = {}

    for label in categories:
        prior_prob = categories[label] / total
        prior_probs[label] = prior_prob
    return prior_probs

In [7]:
priori_probs = cal_priori_probs(datasets)
priori_probs, priori_probs.keys(), list(priori_probs.keys())[1]

({'banana': 0.5, 'orange': 0.3, 'other_fruit': 0.2},
 dict_keys(['banana', 'orange', 'other_fruit']),
 'orange')

### 似然 P(x_i|c_k) == 各特征值在已知水果下的概率

In [8]:
def likelihood_prob(data):

    count, _ = count_total_fruit(data)
    likelihood = {}

    for fruit in data:
        """创建一个临时字典，存储各个特征值的概率"""
        attr_prob = {}
        for attr in data[fruit]:
            # 计算各个特征值在已知水果下的概率
            # attr : long, not long, sweet, not sweet, yellow, not yellow
            attr_prob[attr] = data[fruit][attr] / count[fruit]
        # 存储每个水果下各个特征的信息
        likelihood[fruit] = attr_prob

    return likelihood


In [9]:
LikeHold = likelihood_prob(datasets)
LikeHold
# LikeHold['banana']

{'banana': {'long': 0.8,
  'not_long': 0.2,
  'sweet': 0.7,
  'not_sweet': 0.3,
  'yellow': 0.9,
  'not_yellow': 0.1},
 'orange': {'long': 0.0,
  'not_long': 1.0,
  'sweet': 0.5,
  'not_sweet': 0.5,
  'yellow': 1.0,
  'not_yellow': 0.0},
 'other_fruit': {'long': 0.5,
  'not_long': 0.5,
  'sweet': 0.75,
  'not_sweet': 0.25,
  'yellow': 0.25,
  'not_yellow': 0.75}}

### (分母) 计算特征的概率对分类结果的影响 P(X)

In [10]:
def evidence_prob(data):
    """
        return {'long':50%...}
    """
    attrs = list(data['banana'].keys())  # 获得所有水果特征
    count, total = count_total_fruit(data)
    evidence_prob = {}

    # 计算各种特征的概率
    for attr in attrs:
        attr_total = 0
        for fruit in data:
            # 计算所有水果中，相同属性下水果的总量
            attr_total += data[fruit][attr]
        evidence_prob[attr] = attr_total / total
    return evidence_prob

In [11]:
Evidence_prob = evidence_prob(datasets)
Evidence_prob

{'long': 0.5,
 'not_long': 0.5,
 'sweet': 0.65,
 'not_sweet': 0.35,
 'yellow': 0.8,
 'not_yellow': 0.2}

## 朴素贝叶斯分类器
数据转换

In [12]:
class navie_bayes_classifer:
    """ 初始化朴素贝叶斯， 实例化时调用  P(c|X) = (P(X|c) * P(c)) / P(X)   -> target : 极大后验概率(MAP)
    """
    def __init__(self, data=datasets):
        self._data = datasets
        self._labels = [k for k in self._data.keys()]
        self._priori_prob = cal_priori_probs(self._data)  # 先验概率  P(c)
        self._likelihood_prob = likelihood_prob(self._data)  # 联合概率: 各特征值在已知水果下的概率  P(X|c)
        self._evidence_prob = evidence_prob(self._data)  # P(X)

    def get_label(self, length, sweetness, color):
        """获得某一组特征值的类别"""
        # self._attrs = [length, sweetness, color]
        self._attrs = list(self._data[self._labels[0]].keys())   # problem
        result = {}

        for label in self._labels:
            # 取某种水果的占比, 先验概率
            prob = self._priori_prob[label]

            for attr in self._attrs:
                # 单个水果的某个特征概率除以总的某个特征概率 再乘以某水果占比率
                prob *= self._likelihood_prob[label][attr] / self._evidence_prob[attr]
            result[label] = prob

        return result

## 产生测试数据
测试贝叶斯分类器的预测能力

In [13]:
def random_attr(pair):
    #生成0-1之间的随机数
    return pair[random.randint(0, 1)]

def generate_attrs(test_data_length):
    # 特征值的取值集合
    sets = [('long', 'not_long'), ('sweet', 'not_sweet'), ('yellow', 'not_yellow')]
    test_data = []
    for i in range(test_data_length):
        # 使用map函数来生成一组特征值
        test_data.append(list(map(random_attr, sets)))
    return test_data

# random_attr(('long', 'not_long'))
test_data_length = 20
generate_attrs(test_data_length)

[['long', 'not_sweet', 'not_yellow'],
 ['not_long', 'sweet', 'not_yellow'],
 ['long', 'not_sweet', 'yellow'],
 ['not_long', 'sweet', 'not_yellow'],
 ['long', 'sweet', 'not_yellow'],
 ['long', 'sweet', 'yellow'],
 ['long', 'sweet', 'yellow'],
 ['not_long', 'not_sweet', 'yellow'],
 ['long', 'not_sweet', 'yellow'],
 ['long', 'not_sweet', 'not_yellow'],
 ['long', 'sweet', 'yellow'],
 ['long', 'not_sweet', 'not_yellow'],
 ['long', 'not_sweet', 'not_yellow'],
 ['long', 'not_sweet', 'yellow'],
 ['not_long', 'sweet', 'not_yellow'],
 ['long', 'sweet', 'not_yellow'],
 ['not_long', 'not_sweet', 'not_yellow'],
 ['long', 'not_sweet', 'not_yellow'],
 ['not_long', 'not_sweet', 'not_yellow'],
 ['not_long', 'sweet', 'not_yellow']]

In [14]:
sets = [('long', 'not_long'), ('sweet', 'not_sweet'), ('yellow', 'not_yellow')]
sets[0], sets[0][0], list(map(random_attr, sets))

(('long', 'not_long'), 'long', ['long', 'sweet', 'not_yellow'])

## 使用朴素贝叶斯分类器进行测试

In [15]:
def main(test_data_length):
    length = test_data_length
    test_data = generate_attrs(length)
    classifer = navie_bayes_classifer()

    for data in test_data:
        # 预测属于哪种水果的概率
        result = classifer.get_label(*data)
        # 对后验概率排序，输出概率最大的标签
        label = str(sorted(result.items(), key = operator.itemgetter(1), reverse=True)[0][0])

        print(f"特征值： {data}")
        print(f"预测结果：{result}")
        print(f"类别：{label}\n")

if __name__ == '__main__':
    test_data_length = 20
    main(test_data_length)


特征值： ['long', 'not_sweet', 'yellow']
预测结果：{'banana': 0.1661538461538462, 'orange': 0.0, 'other_fruit': 0.19316620879120877}
类别：other_fruit

特征值： ['not_long', 'not_sweet', 'not_yellow']
预测结果：{'banana': 0.1661538461538462, 'orange': 0.0, 'other_fruit': 0.19316620879120877}
类别：other_fruit

特征值： ['long', 'sweet', 'not_yellow']
预测结果：{'banana': 0.1661538461538462, 'orange': 0.0, 'other_fruit': 0.19316620879120877}
类别：other_fruit

特征值： ['not_long', 'not_sweet', 'not_yellow']
预测结果：{'banana': 0.1661538461538462, 'orange': 0.0, 'other_fruit': 0.19316620879120877}
类别：other_fruit

特征值： ['not_long', 'sweet', 'yellow']
预测结果：{'banana': 0.1661538461538462, 'orange': 0.0, 'other_fruit': 0.19316620879120877}
类别：other_fruit

特征值： ['long', 'sweet', 'not_yellow']
预测结果：{'banana': 0.1661538461538462, 'orange': 0.0, 'other_fruit': 0.19316620879120877}
类别：other_fruit

特征值： ['long', 'sweet', 'not_yellow']
预测结果：{'banana': 0.1661538461538462, 'orange': 0.0, 'other_fruit': 0.19316620879120877}
类别：other_fruit

特征值：