# 11.1 关联分析

# 11.2 Apriori原理

# 11.3 使用Apriori算法来发现频繁集

## 11.3.1 生成候选项集

**程序清单11-1** Apriori算法中的辅助函数

In [1]:
def load_data_set():
    """Return a small sample data set: four transactions of integer items."""
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions

In [2]:
def createC1(data_set):
    """Build the candidate 1-itemsets (C1) from a collection of transactions.

    Args:
        data_set: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list of single-item frozensets, one per distinct item, ordered by
        ascending item value. Frozensets are used so the candidates can
        serve as dictionary keys.
    """
    # A set removes duplicates without rescanning a list for every item.
    unique_items = {item for transaction in data_set for item in transaction}
    return [frozenset([item]) for item in sorted(unique_items)]

In [3]:
def scanD(D, Ck, min_support):
    """Count candidate occurrences and keep those meeting ``min_support``.

    Args:
        D: sized collection of transactions (sets).
        Ck: iterable of candidate itemsets (frozensets).
        min_support: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A ``(frequent, support_data)`` tuple. ``support_data`` maps every
        candidate found in at least one transaction to its support;
        ``frequent`` lists the candidates whose support reaches the
        threshold, most recently encountered first.
    """
    counts = {}
    for transaction in D:
        for candidate in Ck:
            if candidate.issubset(transaction):
                counts[candidate] = counts.get(candidate, 0) + 1

    total = float(len(D))
    support_data = {candidate: hits / total for candidate, hits in counts.items()}

    frequent = [
        candidate
        for candidate, support in support_data.items()
        if support >= min_support
    ]
    # Prepending each hit in turn is the same as reversing the filtered list.
    frequent.reverse()
    return frequent, support_data

In [4]:
# Load the sample transactions and display them.
data_set = load_data_set()
data_set

[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

In [5]:
# Build the sorted list of single-item candidate sets from the transactions.
C1 = createC1(data_set)
C1

[frozenset({1}),
 frozenset({2}),
 frozenset({3}),
 frozenset({4}),
 frozenset({5})]

In [6]:
# Convert every transaction to a set so scanD can run subset tests against it.
D = list(map(set, data_set))
D

[{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]

In [7]:
# Keep the 1-item candidates whose support is at least 0.5; supp_data0 holds
# the support of every candidate that occurred in some transaction.
L1, supp_data0 = scanD(D, C1, 0.5)
L1

[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]

## 11.3.2 组织完整的Apriori算法

**程序清单11-2** Apriori算法

In [8]:
def apriori_gen(Lk, k):
    """Build candidate k-itemsets by joining pairs of itemsets from ``Lk``.

    Two itemsets are merged when their first ``k - 2`` items, taken in
    sorted order, are equal. For k == 2 the prefix is empty, so every pair
    is merged.

    Args:
        Lk: list of frozensets (intended to be frequent (k-1)-itemsets)
            whose items are mutually comparable.
        k: size of the candidate itemsets to generate.

    Returns:
        A list holding the union of every qualifying pair ``(Lk[i], Lk[j])``
        with ``i < j``, in that pair order.
    """
    # Sort BEFORE slicing: a frozenset iterates in arbitrary order, so
    # slicing first would compare arbitrary items and could miss valid joins.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    ret_list = []
    for i, left in enumerate(Lk):
        for j in range(i + 1, len(Lk)):
            if prefixes[i] == prefixes[j]:
                ret_list.append(left | Lk[j])
    return ret_list

In [9]:
def apriori(data_set, min_support=0.5):
    """Find all frequent itemsets in ``data_set`` level by level.

    Args:
        data_set: list of transactions, each an iterable of items.
        min_support: minimum support an itemset needs to be kept.

    Returns:
        A ``(L, support_data)`` tuple. ``L[i]`` is the list of frequent
        itemsets of size ``i + 1``; the final entry is always an empty list,
        which is what ends the search. ``support_data`` maps every counted
        itemset to its support.
    """
    candidates = createC1(data_set)
    transactions = [set(row) for row in data_set]
    frequent, support_data = scanD(transactions, candidates, min_support)

    L = [frequent]
    size = 2
    # Grow itemsets by one item per pass until a pass yields nothing frequent.
    while L[-1]:
        candidates = apriori_gen(L[-1], size)
        frequent, level_support = scanD(transactions, candidates, min_support)
        support_data.update(level_support)
        L.append(frequent)
        size += 1
    return L, support_data

In [10]:
# Run the full Apriori search with the default min_support of 0.5.
L, supp_data = apriori(data_set)

In [11]:
L

[[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})],
 [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})],
 [frozenset({2, 3, 5})],
 []]

In [12]:
L[0]

[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]

In [13]:
L[1]

[frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})]

In [14]:
L[2]

[frozenset({2, 3, 5})]

In [15]:
L[3]

[]

In [16]:
# Candidate 2-itemsets produced by pairing up the frequent 1-itemsets.
apriori_gen(L[0], 2)

[frozenset({2, 5}),
 frozenset({3, 5}),
 frozenset({1, 5}),
 frozenset({2, 3}),
 frozenset({1, 2}),
 frozenset({1, 3})]

In [17]:
# Re-run the search with a stricter support threshold of 0.7.
L, supp_data = apriori(data_set, min_support=0.7)

In [18]:
L

[[frozenset({5}), frozenset({2}), frozenset({3})], [frozenset({2, 5})], []]

# 11.4 从频繁项集中挖掘关联规则

**程序清单11-3** 关联规则生成函数

In [19]:
def generate_rules(L, support_data, min_conf=0.7):
    """Derive association rules from the frequent itemsets in ``L``.

    Args:
        L: list of lists of frequent itemsets, indexed by itemset size
            minus one (as returned by ``apriori``). ``L[0]`` is skipped
            because single-item sets cannot form a rule.
        support_data: mapping from itemset to its support.
        min_conf: minimum confidence a rule needs to be kept.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    big_rule_list = []
    for i, itemsets in enumerate(L[1:], start=1):
        # 2-item sets are scored directly; larger sets go through the
        # recursive consequent builder.
        handler = rules_from_conseq if i > 1 else calc_conf
        for freq_set in itemsets:
            single_consequents = [frozenset([item]) for item in freq_set]
            handler(freq_set, single_consequents, support_data,
                    big_rule_list, min_conf)
    return big_rule_list

In [20]:
def calc_conf(freq_set, H, support_data, brl, min_conf=0.7):
    """Score the rules ``(freq_set - conseq) --> conseq`` for each conseq in H.

    Confidence is ``support(freq_set) / support(freq_set - conseq)``. Every
    rule reaching ``min_conf`` is printed and appended to ``brl`` as an
    ``(antecedent, consequent, confidence)`` tuple.

    Args:
        freq_set: the frequent itemset the rules are drawn from.
        H: iterable of candidate consequents (frozensets).
        support_data: mapping from itemset to its support.
        brl: list that accepted rules are appended to (mutated in place).
        min_conf: minimum confidence a rule needs to be kept.

    Returns:
        The list of consequents whose rule met ``min_conf``.
    """
    kept = []
    for consequent in H:
        whole_support = support_data[freq_set]
        antecedent = freq_set - consequent
        confidence = whole_support / support_data[antecedent]
        if confidence >= min_conf:
            print(antecedent, '-->', consequent, 'conf:', confidence)
            brl.append((antecedent, consequent, confidence))
            kept.append(consequent)
    return kept

In [21]:
def rules_from_conseq(freq_set, H, support_data, brl, min_conf=0.7):
    """Recursively score rules with progressively larger consequents.

    The consequents in ``H`` (each of size m) are joined into consequents of
    size m + 1, which are scored with ``calc_conf``; the surviving ones seed
    the next recursion level. The consequents in ``H`` itself are not scored
    here, only the ones built from them.

    Args:
        freq_set: the frequent itemset the rules are drawn from.
        H: non-empty list of equally sized candidate consequents.
        support_data: mapping from itemset to its support.
        brl: list that accepted rules are appended to (mutated in place).
        min_conf: minimum confidence a rule needs to be kept.
    """
    m = len(H[0])
    # Only continue while the antecedent would still keep at least one item.
    if len(freq_set) > m + 1:
        Hmp1 = apriori_gen(H, m + 1)
        Hmp1 = calc_conf(freq_set, Hmp1, support_data, brl, min_conf)
        # At least two survivors are needed to join into larger consequents.
        if len(Hmp1) > 1:
            # Pass the support_data argument on; the original read the
            # notebook-level global `supp_data` here instead.
            rules_from_conseq(freq_set, Hmp1, support_data, brl, min_conf)

In [22]:
# Recompute the frequent itemsets at min_support=0.5 before mining rules.
L, supp_data = apriori(data_set, min_support=0.5)

In [23]:
# Mine rules with confidence of at least 0.7; each accepted rule is printed.
rules = generate_rules(L, supp_data, min_conf=0.7)

frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({1}) --> frozenset({3}) conf: 1.0


In [24]:
rules

[(frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({1}), frozenset({3}), 1.0)]

In [25]:
# Lower the confidence threshold to 0.5 to see which additional rules qualify.
rules = generate_rules(L, supp_data, min_conf=0.5)

frozenset({3}) --> frozenset({2}) conf: 0.6666666666666666
frozenset({2}) --> frozenset({3}) conf: 0.6666666666666666
frozenset({5}) --> frozenset({3}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({5}) conf: 0.6666666666666666
frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({3}) --> frozenset({1}) conf: 0.6666666666666666
frozenset({1}) --> frozenset({3}) conf: 1.0
frozenset({5}) --> frozenset({2, 3}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({2, 5}) conf: 0.6666666666666666
frozenset({2}) --> frozenset({3, 5}) conf: 0.6666666666666666


In [26]:
rules

[(frozenset({3}), frozenset({2}), 0.6666666666666666),
 (frozenset({2}), frozenset({3}), 0.6666666666666666),
 (frozenset({5}), frozenset({3}), 0.6666666666666666),
 (frozenset({3}), frozenset({5}), 0.6666666666666666),
 (frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({3}), frozenset({1}), 0.6666666666666666),
 (frozenset({1}), frozenset({3}), 1.0),
 (frozenset({5}), frozenset({2, 3}), 0.6666666666666666),
 (frozenset({3}), frozenset({2, 5}), 0.6666666666666666),
 (frozenset({2}), frozenset({3, 5}), 0.6666666666666666)]

# 11.5 示例：发现国会投票中的模式

I was unable to register an API key, so this section is skipped.

# 11.6 示例：发现毒蘑菇的相似特征

In [27]:
# Read the mushroom data: one transaction per line, items separated by
# whitespace. The `with` block closes the file once it has been read.
with open('mushroom.dat') as mush_file:
    mush_dat_set = [line.split() for line in mush_file]

In [28]:
# Find the itemsets contained in at least 30% of the mushroom records.
L, supp_data = apriori(mush_dat_set, min_support=0.3)

In [29]:
# Print the frequent 2-itemsets that contain the item '2'.
# A membership test is used because intersection('2') iterates over the
# characters of the string, which only works for a one-character item.
for item in L[1]:
    if '2' in item:
        print(item)

frozenset({'2', '28'})
frozenset({'53', '2'})
frozenset({'2', '23'})
frozenset({'2', '34'})
frozenset({'2', '36'})
frozenset({'2', '59'})
frozenset({'2', '63'})
frozenset({'2', '67'})
frozenset({'76', '2'})
frozenset({'85', '2'})
frozenset({'86', '2'})
frozenset({'90', '2'})
frozenset({'2', '93'})
frozenset({'2', '39'})


In [30]:
# Print the frequent 4-itemsets that contain the item '2'.
# A membership test is used because intersection('2') iterates over the
# characters of the string, which only works for a one-character item.
for item in L[3]:
    if '2' in item:
        print(item)

frozenset({'85', '2', '39', '28'})
frozenset({'85', '34', '2', '28'})
frozenset({'85', '59', '2', '28'})
frozenset({'85', '2', '63', '28'})
frozenset({'85', '2', '90', '28'})
frozenset({'2', '86', '39', '28'})
frozenset({'34', '2', '86', '28'})
frozenset({'59', '2', '86', '28'})
frozenset({'2', '86', '63', '28'})
frozenset({'85', '2', '86', '28'})
frozenset({'2', '86', '90', '28'})
frozenset({'34', '2', '90', '28'})
frozenset({'59', '34', '2', '28'})
frozenset({'34', '2', '63', '28'})
frozenset({'59', '2', '63', '28'})
frozenset({'34', '2', '39', '28'})
frozenset({'59', '2', '39', '28'})
frozenset({'2', '63', '39', '28'})
frozenset({'53', '85', '34', '2'})
frozenset({'53', '85', '2', '86'})
frozenset({'53', '85', '2', '90'})
frozenset({'53', '85', '2', '39'})
frozenset({'53', '85', '2', '28'})
frozenset({'53', '34', '2', '86'})
frozenset({'53', '2', '86', '90'})
frozenset({'53', '2', '86', '39'})
frozenset({'53', '2', '86', '28'})
frozenset({'53', '34', '2', '90'})
frozenset({'53', '2'

# 11.7 本章小结