# Data-方法-01 规则学习
- 关于规则学习方法的代码实现参考资料相对较少，一些部分的内容又何决策树方法很是相似；
- 这一篇对西瓜数据集2.0（周志华老师的《机器学习》p76），编写了一个最为简单的规则学习分类器；
- 希望通过这最为简单的示例，能简单实践规则学习。

In [1]:
# coding: utf-8

import pandas as pd
# 读入西瓜数据集

path = '../数据集/西瓜数据/西瓜2.csv'

datas = pd.read_csv(path)
datas

Unnamed: 0,编号,色泽,根蒂,敲声,纹理,脐部,触感,好瓜
0,1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,是
1,2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,是
2,3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,是
3,4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,是
4,5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,是
5,6,青绿,稍蜷,浊响,清晰,稍凹,软粘,是
6,7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,是
7,8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,是
8,9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,否
9,10,青绿,硬挺,清脆,清晰,平坦,软粘,否


In [2]:
# 评估规则覆盖率和准确率的函数
def evaluate(rule, datas):
    data_num = datas['编号'].size
    touch = right = 0
    keys = rule.keys()
    
    for i in range(0,data_num):
        for key in keys:
            if key != '好瓜' and key != '评估':
                if datas[key][i] == rule[key]:
                    continue
                else: 
                    break
            elif key != '评估':
                touch += 1
                if datas[key][i] == rule[key]:
                    right += 1
                    
    coverage = touch/data_num
    accuracy = 0 if touch==0 else right/touch
        
    return coverage, accuracy


# 示例
rule = {'色泽':'青绿','根蒂':'蜷缩','好瓜':'是'} #规则示例
print('规则:')
print(rule)

cov, acc = evaluate(rule,datas)
print('覆盖率：',cov, ' | 准确率：',acc)


# 评估规则集覆盖率和准确率的函数
def evaluate_rules(rules, datas):
    data_num = datas['编号'].size
    touch = right = 0
    
    for i in range(0,data_num):
        for rule in rules:
            is_cover = False
            keys = rule.keys()
            
            for key in keys:
                if key != '好瓜' and key != '评估':
                    if datas[key][i] == rule[key]:
                        continue
                    else: 
                        break
                elif key == '好瓜':
                    touch += 1
                    is_cover = True
                    if datas[key][i] == rule[key]:
                        right += 1
                    break
            
            if(is_cover): break
                    
    coverage = touch/data_num
    accuracy = 0 if touch==0 else right/touch
        
    return coverage, accuracy


# 示例
rules = [{'色泽':'青绿','根蒂':'蜷缩','好瓜':'是'},
        {'纹理':'清晰','好瓜':'是'}] #规则集示例
print('规则集:')
print(rules)

cov, acc = evaluate_rules(rules,datas)
print('覆盖率：',cov, ' | 准确率：',acc)

规则:
{'色泽': '青绿', '根蒂': '蜷缩', '好瓜': '是'}
覆盖率： 0.17647058823529413  | 准确率： 0.6666666666666666
规则集:
[{'色泽': '青绿', '根蒂': '蜷缩', '好瓜': '是'}, {'纹理': '清晰', '好瓜': '是'}]
覆盖率： 0.5882352941176471  | 准确率： 0.7


In [3]:
# 从训练集获取全部属性可能
def get_attributes(datas):
    data_num = datas['编号'].size
    attributes = {}

    for att in datas.columns.tolist():
        if att!='编号' and att!='好瓜': attributes[att]=set(datas[att])
    
    return attributes


# 从训练集构建初始逻辑文字
def init_rule(datas):
    attributes = get_attributes(datas)
    
    rules = []
    keys = attributes.keys()
    
    for key in keys:
        for att in attributes[key]:
            rule = {}
            rule[key] = att
            rule['好瓜'] = '是'
            cov, acc = evaluate(rule,datas)
            rule['评估'] = {'cov':cov, 'acc':acc}
            rules.append(rule)
            
            rule = {}
            rule[key] = att
            rule['好瓜'] = '否'
            cov, acc = evaluate(rule,datas)
            rule['评估'] = {'cov':cov, 'acc':acc}
            rules.append(rule)
    
    rules = sorted(rules, key = lambda i:(i['评估']['acc'], i['评估']['cov']), reverse=True)
    return rules

rules = init_rule(datas)
rules

[{'脐部': '平坦', '好瓜': '否', '评估': {'cov': 0.23529411764705882, 'acc': 1.0}},
 {'纹理': '模糊', '好瓜': '否', '评估': {'cov': 0.17647058823529413, 'acc': 1.0}},
 {'根蒂': '硬挺', '好瓜': '否', '评估': {'cov': 0.11764705882352941, 'acc': 1.0}},
 {'敲声': '清脆', '好瓜': '否', '评估': {'cov': 0.11764705882352941, 'acc': 1.0}},
 {'色泽': '浅白', '好瓜': '否', '评估': {'cov': 0.29411764705882354, 'acc': 0.8}},
 {'纹理': '稍糊', '好瓜': '否', '评估': {'cov': 0.29411764705882354, 'acc': 0.8}},
 {'纹理': '清晰',
  '好瓜': '是',
  '评估': {'cov': 0.5294117647058824, 'acc': 0.7777777777777778}},
 {'脐部': '凹陷',
  '好瓜': '是',
  '评估': {'cov': 0.4117647058823529, 'acc': 0.7142857142857143}},
 {'色泽': '乌黑',
  '好瓜': '是',
  '评估': {'cov': 0.35294117647058826, 'acc': 0.6666666666666666}},
 {'根蒂': '蜷缩', '好瓜': '是', '评估': {'cov': 0.47058823529411764, 'acc': 0.625}},
 {'敲声': '浊响', '好瓜': '是', '评估': {'cov': 0.5882352941176471, 'acc': 0.6}},
 {'敲声': '沉闷', '好瓜': '否', '评估': {'cov': 0.29411764705882354, 'acc': 0.6}},
 {'触感': '软粘', '好瓜': '否', '评估': {'cov': 0.294117647058823

In [4]:
# 从候选规则集中选出“最好”的规则
acc_threshold = 0.2 # 准确率若太低则不选
cov_threshold = 0.2 # 覆盖率若太低则不选

def choose_rule(rules):
    if len(rules)==0: return {}
    
    for rule in rules:
        if rule['评估']['acc']<acc_threshold: return {}
        if rule['评估']['cov']<cov_threshold: continue
        else: return rule
    
    return {}

rule = choose_rule(rules)  
rule

{'脐部': '平坦', '好瓜': '否', '评估': {'cov': 0.23529411764705882, 'acc': 1.0}}

In [5]:
# 从训练集中删除被某条规则覆盖的样例
def delete_data(datas, rule):
    data_num = datas['编号'].size
    new_datas = []
    keys = rule.keys()
    
    for i in range(0,data_num):
        is_cover = True
        
        for key in keys:
            if key=='好瓜' or key=='评估': continue
            
            if(datas[key][i]!=rule[key]):
                is_cover = False
                break
        
        if not is_cover: new_datas.append(datas.iloc[i].to_dict())
    
    return pd.DataFrame(new_datas)

new_datas = delete_data(datas, rule)
print(new_datas['编号'].size)
new_datas

13


Unnamed: 0,编号,色泽,根蒂,敲声,纹理,脐部,触感,好瓜
0,1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,是
1,2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,是
2,3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,是
3,4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,是
4,5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,是
5,6,青绿,稍蜷,浊响,清晰,稍凹,软粘,是
6,7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,是
7,8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,是
8,9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,否
9,13,青绿,稍蜷,浊响,稍糊,凹陷,硬滑,否


In [6]:
# 在一条规则基础上加入新的合取项
def add_literal(rule, datas):
    if(datas['编号'].size==0): return []
    
    rules = []
    attributes = get_attributes(datas)
    data_keys = datas.keys()
    rule_keys = rule.keys()
    
    for key in data_keys:
        if key=='编号': continue
        if key in rule_keys: continue
        
        for att in attributes[key]:
            new_rule = rule.copy()
            new_rule[key] = att
            cov, acc = evaluate(new_rule,datas)
            new_rule['评估'] = {'cov':cov, 'acc':acc}
            rules.append(new_rule)
    
    rules = sorted(rules, key = lambda i:(i['评估']['acc'], i['评估']['cov']), reverse=True)
    
    return rules

print(rule)
new_rules = add_literal(rule, datas)
new_rules

{'脐部': '平坦', '好瓜': '否', '评估': {'cov': 0.23529411764705882, 'acc': 1.0}}


[{'脐部': '平坦',
  '好瓜': '否',
  '评估': {'cov': 0.23529411764705882, 'acc': 1.0},
  '色泽': '浅白'},
 {'脐部': '平坦',
  '好瓜': '否',
  '评估': {'cov': 0.23529411764705882, 'acc': 1.0},
  '色泽': '乌黑'},
 {'脐部': '平坦',
  '好瓜': '否',
  '评估': {'cov': 0.23529411764705882, 'acc': 1.0},
  '色泽': '青绿'},
 {'脐部': '平坦',
  '好瓜': '否',
  '评估': {'cov': 0.23529411764705882, 'acc': 1.0},
  '根蒂': '硬挺'},
 {'脐部': '平坦',
  '好瓜': '否',
  '评估': {'cov': 0.23529411764705882, 'acc': 1.0},
  '根蒂': '稍蜷'},
 {'脐部': '平坦',
  '好瓜': '否',
  '评估': {'cov': 0.23529411764705882, 'acc': 1.0},
  '根蒂': '蜷缩'},
 {'脐部': '平坦',
  '好瓜': '否',
  '评估': {'cov': 0.23529411764705882, 'acc': 1.0},
  '敲声': '清脆'},
 {'脐部': '平坦',
  '好瓜': '否',
  '评估': {'cov': 0.23529411764705882, 'acc': 1.0},
  '敲声': '沉闷'},
 {'脐部': '平坦',
  '好瓜': '否',
  '评估': {'cov': 0.23529411764705882, 'acc': 1.0},
  '敲声': '浊响'},
 {'脐部': '平坦',
  '好瓜': '否',
  '评估': {'cov': 0.23529411764705882, 'acc': 1.0},
  '纹理': '模糊'},
 {'脐部': '平坦',
  '好瓜': '否',
  '评估': {'cov': 0.23529411764705882, 'acc': 1.0},
  '

In [38]:
# 训练过程
def rule_learn(path):
    # 读入训练集
    datas = pd.read_csv(path)
    
    # 中止条件
    cov_terminal = 0.8
    acc_terminal = 0.7
    
    # 最终规则集
    rules = []
    
    # 训练
    current_datas = datas.copy()
    current_cov = 0
    current_acc = 0
    
    while True:
        current_rules = rules.copy()
        rule = {}
        
        #初始化规则候选集
        init_rules = init_rule(current_datas) 
        if(not init_rules): break
        #选择最佳规则
        rule = choose_rule(init_rules)
        if(not rule): break
        print('######')
        print('1. 初始规则: ', rule)
        
        #不断迭代，增加逻辑文字
        while True:
            add_rule = {}
            cov = rule['评估']['cov']
            acc = rule['评估']['acc']
            #增加逻辑文字的候选集
            add_rules = add_literal(rule, datas)
            if not add_rules: break
            #选择最佳规则判断
            add_rule = choose_rule(add_rules)
            if not add_rule: break
            print('2. 增加逻辑文字: ', add_rule)
            if add_rule['评估']['acc']<=acc: break
            rule = add_rule.copy()
        
        # 将训练得到的规则加入规则集
        current_rules.append(rule)
        # 删除规则覆盖的情况
        current_datas = delete_data(current_datas, rule)
        if current_datas.size==0: break
        # 如果达到终止条件则结束
        cov, acc = evaluate_rules(current_rules, datas)
        rules = current_rules.copy()
        
        print('3. 规则集:', rules)
        if cov>cov_terminal: break
    
    cov, acc = evaluate_rules(rules, datas)
    return {'rules':rules, '评估':{'cov':cov, 'acc':acc}}


result = rule_learn(path)
result

######
1. 初始规则:  {'脐部': '平坦', '好瓜': '否', '评估': {'cov': 0.23529411764705882, 'acc': 1.0}}
2. 增加逻辑文字:  {'脐部': '平坦', '好瓜': '否', '评估': {'cov': 0.23529411764705882, 'acc': 1.0}, '色泽': '浅白'}
3. 规则集: [{'脐部': '平坦', '好瓜': '否', '评估': {'cov': 0.23529411764705882, 'acc': 1.0}}]
######
1. 初始规则:  {'纹理': '清晰', '好瓜': '是', '评估': {'cov': 0.6153846153846154, 'acc': 0.875}}
2. 增加逻辑文字:  {'纹理': '清晰', '好瓜': '是', '评估': {'cov': 0.5294117647058824, 'acc': 0.7777777777777778}, '色泽': '浅白'}
3. 规则集: [{'脐部': '平坦', '好瓜': '否', '评估': {'cov': 0.23529411764705882, 'acc': 1.0}}, {'纹理': '清晰', '好瓜': '是', '评估': {'cov': 0.6153846153846154, 'acc': 0.875}}]
######
1. 初始规则:  {'触感': '硬滑', '好瓜': '否', '评估': {'cov': 0.8, 'acc': 1.0}}
2. 增加逻辑文字:  {'触感': '硬滑', '好瓜': '否', '评估': {'cov': 0.7058823529411765, 'acc': 0.5}, '色泽': '浅白'}
3. 规则集: [{'脐部': '平坦', '好瓜': '否', '评估': {'cov': 0.23529411764705882, 'acc': 1.0}}, {'纹理': '清晰', '好瓜': '是', '评估': {'cov': 0.6153846153846154, 'acc': 0.875}}, {'触感': '硬滑', '好瓜': '否', '评估': {'cov': 0.8, 'acc': 1.0

{'rules': [{'脐部': '平坦',
   '好瓜': '否',
   '评估': {'cov': 0.23529411764705882, 'acc': 1.0}},
  {'纹理': '清晰', '好瓜': '是', '评估': {'cov': 0.6153846153846154, 'acc': 0.875}},
  {'触感': '硬滑', '好瓜': '否', '评估': {'cov': 0.8, 'acc': 1.0}}],
 '评估': {'cov': 0.9411764705882353, 'acc': 0.9375}}

In [40]:
# 展示最终结果

def show_result(result):
    print('[规则集：')
    num = 0
    for rule in result['rules']:
        num += 1
        print('  规则'+str(num)+": ",end='')
        is_first = True
        for key in rule.keys():
            if key!='好瓜' and key!='评估':
                if is_first: 
                    is_first = False
                    print(key,'=',rule[key],end='')
                else:
                    print(' ∧ ',key,'=',rule[key],end='')
        print('','-->','好瓜','=',rule['好瓜'])
    
    print(']')
    print('[评估结果：')
    print('  覆盖率：',result['评估']['cov'])
    print('  准确率：',result['评估']['acc'])
    print(']')
    
    return

show_result(result)

[规则集：
  规则1: 脐部 = 平坦 --> 好瓜 = 否
  规则2: 纹理 = 清晰 --> 好瓜 = 是
  规则3: 触感 = 硬滑 --> 好瓜 = 否
]
[评估结果：
  覆盖率： 0.9411764705882353
  准确率： 0.9375
]
