## 基本概念

**支持度**:
$$support(A \Rightarrow B)=P(A \cup B)$$

其中 $P(A \cup B)$ 表示事务同时包含项集 A 和 B 中所有项（即包含 $A \cup B$）的概率，而不是事务包含 A 或 B 的概率。

**置信度**:
$$confidence(A \Rightarrow B)=P(B | A)$$

**频繁项集、闭项集**:

项的集合称为**项集**，项集的出现频度简称为项集的**频度**或**支持度计数**。如果项集的相对支持度满足预定义的最小支持度阈值，则称为**频繁项集**。由于

$$confidence(A \Rightarrow B)=\frac{support(A \cup B)}{support(A)}=\frac{support\_count(A \cup B)}{support\_count(A)}$$

因此挖掘关联规则问题归结为挖掘频繁项集。

一般而言，关联规则的挖掘过程有两步:
* 找出所有频繁项集
* 由频繁项集产生强关联规则

从大型数据集中挖掘频繁项集的主要挑战是：挖掘常常产生大量满足最小支持度阈值的项集。这是因为如果一个项集是频繁的，则它的每个非空子集也是频繁的。例如，一个包含 n 个项的频繁项集，其非空子集（它们都是频繁项集）的总个数为

$$C^1_n+C^2_n+...+C^n_n=2^n-1$$

如果不存在真超项集 Y 使得 Y 与 X 在 D 中具有相同的支持度计数，则称项集 X 在数据集 D 中是**闭的**；如果 X 在 D 中既是闭的又是频繁的，则称 X 是 D 中的**闭频繁项集**。如果 X 是一个频繁项集，而且 X 的任意一个真超集都是非频繁的，则称 X 是**最大频繁项集**。



## 频繁项集挖掘方法

### Apriori 算法

Apriori 使用一种称为逐层搜索的迭代方法，使用频繁 k 项集探索频繁 k+1 项集，直到不能再找到更长的频繁项集为止。为了提高频繁项集逐层产生的效率，使用先验性质压缩搜索空间。

**先验性质**:频繁项集的非空子集也一定是频繁的。

![](http://oh1zr9i3e.bkt.clouddn.com/18-4-21/61309405.jpg)


In [2]:
import numpy as np


def load_data():
    """Return the toy transaction database used by the examples.

    Returns:
        A 1-D NumPy object array holding nine transactions, each one a list
        of integer item ids.
    """
    transactions = [
        [1, 2, 5], [2, 4], [2, 3], [1, 2, 4], [1, 3],
        [2, 3], [1, 3], [1, 2, 3, 5], [1, 2, 3],
    ]
    # The transactions differ in length, so the object dtype must be requested
    # explicitly: recent NumPy releases refuse to infer a ragged array and
    # raise ValueError instead.
    return np.array(transactions, dtype=object)


def create_c1(data):
    """Build the candidate 1-itemsets (C1) from the transaction data.

    Args:
        data: Iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A lazy ``map`` yielding one single-item ``frozenset`` per distinct
        item, in ascending item order. Being an iterator, it can be consumed
        only once.
    """
    # A set de-duplicates in constant time per item instead of rescanning a
    # list for every item of every transaction.
    unique_items = {item for row in data for item in row}
    return map(frozenset, [[item] for item in sorted(unique_items)])


def scan_d(d, c_k, min_support):
    """Count how often each candidate occurs in ``d`` and keep the frequent ones.

    Args:
        d: Re-iterable collection of transactions; it is scanned once per
            candidate.
        c_k: Iterable of candidate itemsets (objects with ``issubset``).
        min_support: Minimum number of containing transactions (an absolute
            count) a candidate needs in order to be kept.

    Returns:
        ``(frequent, support_data)``: a NumPy array of the kept itemsets and
        a dict mapping each kept itemset to its count. The kept itemsets are
        also printed as a list.
    """
    counts = {}
    for candidate in c_k:
        for transaction in d:
            if candidate.issubset(transaction):
                counts[candidate] = counts.get(candidate, 0) + 1

    # Candidates that never occurred are absent from ``counts`` and therefore
    # never reported, whatever the threshold.
    support_data = {
        itemset: count
        for itemset, count in counts.items()
        if count >= min_support
    }
    res_list = list(support_data)

    print(res_list)
    return np.array(res_list), support_data


def apriori_gen(l):
    """Join frequent k-itemsets into candidate (k+1)-itemsets.

    Two itemsets are joined when their k-1 smallest items agree; the candidate
    is the first itemset plus the largest item of the second.

    Args:
        l: Iterable of equally sized itemsets whose items are mutually
            orderable.

    Returns:
        A lazy ``map`` yielding every candidate as a ``frozenset``. Being an
        iterator, it can be consumed only once. The candidates are also
        printed as a list of sorted lists.
    """
    # Set iteration order is arbitrary, so each itemset is normalised to a
    # sorted list before prefixes are compared; comparing raw iteration order
    # silently misses joinable pairs.
    ordered = [sorted(itemset) for itemset in l]

    res_list = []
    for i, value_i in enumerate(ordered):
        for value_j in ordered[i + 1:]:
            if value_i[:-1] == value_j[:-1]:
                # Plain list concatenation keeps the items' own types.
                res_list.append(sorted(value_i + [value_j[-1]]))

    print(res_list)
    return map(frozenset, res_list)


def apriori(d, min_support):
    """Mine the frequent itemsets of ``d`` level by level, tracing each step.

    Args:
        d: Transaction collection accepted by ``create_c1`` and ``scan_d``.
        min_support: Minimum support count, forwarded to ``scan_d``.

    Returns:
        ``(L, support_data)``: the list of non-empty frequent-itemset levels
        (index 0 holds the 1-itemsets) and a dict mapping every frequent
        itemset to its support count.
    """
    candidates = create_c1(d)
    print("---------L1--------")
    frequent, support_data = scan_d(d, candidates, min_support)

    levels = []
    size = 2  # itemset size of the level about to be generated
    # Stop as soon as a level yields no frequent itemset.
    while len(frequent):
        levels.append(frequent)
        print("---------C%d--------" % size)
        candidates = apriori_gen(frequent)
        print("---------L%d--------" % size)
        frequent, level_support = scan_d(d, candidates, min_support)
        support_data.update(level_support)
        size += 1

    return levels, support_data


if __name__ == '__main__':
    # Demo run: mine the toy transactions with a minimum support count of 2
    # and print the support count recorded for every frequent itemset.
    test_data = load_data()
    l, support_data = apriori(test_data, 2)
    print(support_data)

---------L1--------
[frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]
---------C2--------
[[1, 2], [1, 3], [1, 4], [1, 5], [2, 3], [2, 4], [2, 5], [3, 4], [3, 5], [4, 5]]
---------L2--------
[frozenset({1, 2}), frozenset({1, 3}), frozenset({1, 5}), frozenset({2, 3}), frozenset({2, 4}), frozenset({2, 5})]
---------C3--------
[[1, 2, 3], [1, 2, 5], [1, 3, 5], [2, 3, 4], [2, 3, 5], [2, 4, 5]]
---------L3--------
[frozenset({1, 2, 3}), frozenset({1, 2, 5})]
---------C4--------
[[1, 2, 3, 5]]
---------L4--------
[]
{frozenset({1}): 6, frozenset({2}): 7, frozenset({3}): 6, frozenset({4}): 2, frozenset({5}): 2, frozenset({1, 2}): 4, frozenset({1, 3}): 4, frozenset({1, 5}): 2, frozenset({2, 3}): 4, frozenset({2, 4}): 2, frozenset({2, 5}): 2, frozenset({1, 2, 3}): 2, frozenset({1, 2, 5}): 2}


## 由频繁项集产生关联规则

In [8]:
import numpy as np


def load_data():
    """Return the nine sample transactions used by the rule-generation demo.

    Returns:
        A 1-D NumPy object array whose elements are lists of integer item ids.
    """
    # dtype=object is required because the rows have different lengths;
    # without it recent NumPy versions raise ValueError for ragged input.
    return np.array(
        [
            [1, 2, 5],
            [2, 4],
            [2, 3],
            [1, 2, 4],
            [1, 3],
            [2, 3],
            [1, 3],
            [1, 2, 3, 5],
            [1, 2, 3],
        ],
        dtype=object,
    )


def create_c1(data):
    """Collect every distinct item of ``data`` as a candidate 1-itemset.

    Args:
        data: Iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A one-shot ``map`` iterator of single-item ``frozenset`` objects in
        ascending item order.
    """
    # Gather distinct items with a set rather than a linear list-membership
    # test per item, which was quadratic in the number of distinct items.
    seen = set()
    for transaction in data:
        seen.update(transaction)
    return map(frozenset, ([item] for item in sorted(seen)))


def scan_d(d, c_k, min_support):
    """Filter the candidates ``c_k`` down to those frequent in ``d``.

    Args:
        d: Re-iterable collection of transactions; it is scanned once per
            candidate.
        c_k: Iterable of candidate itemsets (objects with ``issubset``).
        min_support: Minimum number of containing transactions (an absolute
            count) required to keep a candidate.

    Returns:
        ``(frequent, sup_data)``: a NumPy array of the kept itemsets and a
        dict mapping each kept itemset to its count.
    """
    tally = {}
    for candidate in c_k:
        hits = sum(1 for transaction in d if candidate.issubset(transaction))
        # Candidates that occur in no transaction are never recorded.
        if hits:
            tally[candidate] = tally.get(candidate, 0) + hits

    sup_data = {}
    for itemset, count in tally.items():
        if count >= min_support:
            sup_data[itemset] = count

    return np.array(list(sup_data)), sup_data


def apriori_gen(l):
    """Join equally sized itemsets into candidates one item larger.

    Two itemsets are joined when, in ascending item order, they agree on
    everything but their last item; the candidate is the first itemset plus
    the last item of the second.

    Args:
        l: Iterable of equally sized itemsets whose items are mutually
            orderable.

    Returns:
        A one-shot ``map`` iterator yielding each candidate as a
        ``frozenset``.
    """
    # Sort each itemset up front: set iteration order is arbitrary, so
    # comparing prefixes of ``list(itemset)`` can miss joinable pairs.
    ordered = [sorted(itemset) for itemset in l]

    res_list = []
    for i, first in enumerate(ordered):
        prefix = first[:-1]
        for second in ordered[i + 1:]:
            if second[:-1] == prefix:
                # Plain list concatenation keeps the items' own types.
                res_list.append(sorted(first + [second[-1]]))
    return map(frozenset, res_list)


def apriori(d, min_support):
    """Mine all frequent itemsets of ``d`` with a level-wise search.

    Args:
        d: Transaction collection accepted by ``create_c1`` and ``scan_d``.
        min_support: Minimum support count, forwarded to ``scan_d``.

    Returns:
        ``(L, support_data)``: the list of non-empty frequent-itemset levels
        (index 0 holds the 1-itemsets) and a dict mapping every frequent
        itemset to its support count.
    """
    ck = create_c1(d)
    lk, support_data = scan_d(d, ck, min_support)
    L = []

    # Each pass turns the current level into candidates one item larger and
    # keeps the frequent ones; an empty level ends the search.
    while len(lk) > 0:
        L.append(lk)
        ck = apriori_gen(lk)
        lk, sup_data = scan_d(d, ck, min_support)
        support_data.update(sup_data)

    return L, support_data


def generate_rules(l, sup_data, min_conf=0.7):
    """Derive association rules from the levels of frequent itemsets.

    Args:
        l: Sequence of frequent-itemset levels as returned by ``apriori``.
        sup_data: Dict mapping itemsets to their support counts.
        min_conf: Minimum confidence a rule needs in order to be kept.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples. The rules
        are also printed by ``calc_conf`` as they are found.
    """
    rule_list = []
    for level, itemsets in enumerate(l):
        if level == 0:
            # Index 0 is skipped: in the levels produced by ``apriori`` it
            # holds the 1-itemsets, which cannot be split into a rule.
            continue
        # Index 1 is scored directly with single-item consequents; later
        # levels go through ``rules_conseq``, which also grows the consequent.
        emit = rules_conseq if level > 1 else calc_conf
        for freq_set in itemsets:
            singletons = [frozenset([item]) for item in freq_set]
            emit(freq_set, singletons, sup_data, rule_list, min_conf)
    return rule_list


def calc_conf(freq, h, sup_data, rl, min_conf):
    """Score the rules ``freq - conseq -> conseq`` and keep the confident ones.

    Args:
        freq: The frequent itemset (a ``frozenset``) the rules are split from.
        h: Iterable of candidate consequents, each a ``frozenset``.
        sup_data: Dict mapping itemsets to support counts; it must contain
            ``freq`` and every antecedent ``freq - conseq``.
        rl: List that receives an ``(antecedent, consequent, confidence)``
            tuple for every kept rule (mutated in place).
        min_conf: Minimum confidence for a rule to be kept.

    Returns:
        The consequents from ``h`` whose rule reached ``min_conf``, in input
        order. Each kept rule is also printed.
    """
    kept = []

    for conseq in h:
        freq_count = sup_data[freq]
        antecedent = freq - conseq
        # confidence = support_count(freq) / support_count(antecedent)
        conf = freq_count / sup_data[antecedent]
        if conf >= min_conf:
            print(antecedent, "---->", conseq, end='')
            print('   conf=%f' % conf)
            rule = (antecedent, conseq, conf)
            rl.append(rule)
            kept.append(conseq)
    return kept


def rules_conseq(freq, h, sup_data, rl, min_conf):
    """Generate the rules of ``freq`` by growing the consequent step by step.

    The supplied consequents are scored first. The confident ones are then
    joined into consequents one item larger, which are scored in turn, until
    fewer than two survive or a larger consequent would leave no antecedent.

    Args:
        freq: The frequent itemset (a ``frozenset``) the rules are split from.
        h: Non-empty list of equally sized candidate consequents.
        sup_data: Dict mapping itemsets to support counts.
        rl: List that collects ``(antecedent, consequent, confidence)``
            tuples (mutated in place through ``calc_conf``).
        min_conf: Minimum confidence for a rule to be kept.
    """
    if len(freq) <= len(h[0]) + 1:
        # Consequents one item larger would use up the whole itemset, so
        # there is nothing to generate from these candidates.
        return

    # Score every consequent exactly once. The previous recursive version
    # re-scored the survivors of each level on the next call, which appended
    # and printed duplicate rules for itemsets with four or more items.
    # Continuing only from the confident consequents loses no rule, assuming
    # consistent support counts: extending a consequent shrinks the
    # antecedent, so its support cannot drop and the confidence cannot rise.
    h = calc_conf(freq, h, sup_data, rl, min_conf)
    while len(h) > 1 and len(freq) > len(h[0]) + 1:
        h = calc_conf(freq, apriori_gen(h), sup_data, rl, min_conf)


if __name__ == '__main__':
    # Demo run: mine frequent itemsets with a minimum support count of 2, then
    # derive association rules with generate_rules' default min_conf of 0.7.
    # The rules are printed by calc_conf as they are found.
    test_data = load_data()
    l, support_data = apriori(test_data, 2)
    rule = generate_rules(l, support_data)
    # print(rule)  # uncomment to dump the (antecedent, consequent, confidence) tuples


frozenset({5}) ----> frozenset({1})   conf=1.000000
frozenset({4}) ----> frozenset({2})   conf=1.000000
frozenset({5}) ----> frozenset({2})   conf=1.000000
frozenset({2, 5}) ----> frozenset({1})   conf=1.000000
frozenset({1, 5}) ----> frozenset({2})   conf=1.000000
frozenset({5}) ----> frozenset({1, 2})   conf=1.000000
