## 关联分析目标：
    * 发现频繁项集
    * 发现关联规则

### 使用Apriori算法来发现频繁项集（满足最小支持度）
    * 输入数据集和最小支持度
    * 首先，生成所有单个物品的项集列表
    * 扫描交易记录，查看哪些项集满足最小支持度要求，去掉不满足的
    * 对剩下的集合进行组合以生成包含两个元素的项集
    * 重新扫描交易记录，去掉不满足最小支持度的项集
    * 重复上述过程，直到所有项集都被去掉。

In [1]:
def loadDataSet():
    '''生成数据集'''
    dataSet = [[1, 3, 4],
              [2, 3, 5],
              [1, 2, 3, 5],
              [2, 5]]
    return dataSet

In [2]:
def createC1(dataSet):
    '''构建大小为 1 的所有候选项的集合'''
    C1 = []                                 # 用于存储所有不重复的项值
    for data in dataSet:                    # 遍历每一条数据
        for item in data:                   # 遍历数据中的每一个项
            if not [item] in C1:             
                C1.append([item])
    C1.sort()
    return list(map(frozenset, C1))               # 对C1中的每个项构建一个不变集合

In [3]:
def scanDataSet(dataSet, Ck, minSupport):
    '''
    D:数据集，Ck候选项集列表，minSupport：感兴趣项集的最小支持度
    返回：包含满足最小支持度项的列表，包含支持度项和该项出现次数的字典
    '''
    ssCnt = {}                                 # 用于存放集合及其出现次数，键为集合，值为出现的次数
    for data in dataSet:                       # 每一个数据  
        for item in Ck:                        # 遍历C1中的值
            if item.issubset(data):            # 如果ck中的值是数据的子集
                ssCnt[item] = ssCnt.get(item, 0) + 1         #那么，保存该值，并记录这个值在数据集中出现的次数
    
    m = float(len(dataSet))                    # 总的样本数
    Lk = []                                    # 空列表，用于存放满足最小支持度的项 
    supportData = {}                           # 空列字典，用于存放满足最小支持度的项和该项出现的次数 
    for key in ssCnt.keys():
        support = ssCnt[key]/m
        if support >= minSupport:
            Lk.insert(0, key)
        supportData[key] = support
    return Lk, supportData                     # 

In [4]:
# 测试
dataSet = loadDataSet()
print('dataSet:', dataSet)

C1 = createC1(dataSet)
print('C1:', list(C1))

L1, supportData = scanDataSet(dataSet, C1, 0.5)
print('L1:', L1)
print('supportData:', supportData)

dataSet: [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
C1: [frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]
L1: [frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]
supportData: {frozenset({1}): 0.5, frozenset({3}): 0.75, frozenset({4}): 0.25, frozenset({2}): 0.75, frozenset({5}): 0.75}


In [5]:
def createCk(Lk, k):
    '''
    创建Ck，将满足最小支持度的集合进行组合
    输入：频繁项集列表Lk，项集元素个数k
    输出：Ck
    '''
    Ck = []
    for i in range(len(Lk)):
        for j in range(i+1, len(Lk)):
            #print('i,j:',i,j)
            L1 = list(Lk[i])[:k-2]
            #print('L1:',L1)
            L2 = list(Lk[j])[:k-2]
            #print('L2:',L2)
            L1.sort()
            L2.sort()
            if L1==L2:
                Ck.append(Lk[i] | Lk[j])
            #print(Ck)
    return Ck

In [6]:
def apriori(dataSet, minSupport=0.5):
    C1 = createC1(dataSet)
    L1, supportData = scanDataSet(dataSet, C1, minSupport)
    
    L = [L1]
    k = 2
    while(len(L[k-2])>0):
        Ck = createCk(L[k-2], k)
        Lk, supK = scanDataSet(dataSet, Ck, minSupport)
        L.append(Lk)
        supportData.update(supK)
        k = k+1
    return L, supportData

In [7]:
L, supportData = apriori(dataSet)
print('minSupport:0.5', L)
#print(supportData)

minSupport:0.5 [[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})], [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})], [frozenset({2, 3, 5})], []]


### 从频繁项集中挖掘关联规则（满足可信度）

In [8]:
def calcConf(freqSet, H, supportData, RuleList, minConf=0.7):
    '''
    计算最小可信度, 对规则进行评估
    输入：频繁项集freqSet，组成频繁项集的单元素列表H
    '''
    prunedH = []
    for conseq in H:
        conf = supportData[freqSet]/supportData[freqSet-conseq]
        if conf >= minConf:
            print(freqSet-conseq, '==>', conseq, 'conf:', conf)
            RuleList.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH

In [9]:
def rulesFromConseq(freqSet, H, supportData, RuleList, minConf=0.7):
    '''
    输入：频繁项集freqSet，元素列表H，项集支持度，
    生成候选规则集合
    '''
    m = len(H[0])
    if (len(freqSet)>(m+1)):
        Hmp1 = createCk(H, m+1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, RuleList, minConf)
        if (len(Hmp1)>1):
            rulesFromConseq(freqSet, Hmp1, supportData, RuleList, minConf)

In [10]:
def generateRules(L, supportData, minConf=0.7):
    '''
    输入：频繁项集列表L，包含{项集：支持度}的字典，最小可信度阈值
    输出：包含可信度的规则列表
    
    '''
    RuleList = []
    for i in range(1, len(L)):                                 #关联规则无法在单元素集合中产生
        for freqSet in L[i]:                                   #对每个含多元素的频繁项集           
            
            H1 = [frozenset([item]) for item in freqSet]       #创建单元素集合列表
            if i>1:                                            # i>1时，freqSet中的频繁项集内元素个数大于2。需要对H1进行合并
                rulesFromConseq(freqSet, H1, supportData, RuleList, minConf)   
            else:                                              # i=1时，freqSet中频繁项集内元素个数等于2，可以直接计算可信度 
                calcConf(freqSet, H1, supportData, RuleList, minConf)
    return RuleList

In [11]:
L, supportData = apriori(dataSet,0.5)
generateRules(L, supportData, minConf=0.7)

frozenset({5}) ==> frozenset({2}) conf: 1.0
frozenset({2}) ==> frozenset({5}) conf: 1.0
frozenset({1}) ==> frozenset({3}) conf: 1.0


[(frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({1}), frozenset({3}), 1.0)]

In [12]:
rules = generateRules(L, supportData, minConf=0.5)

frozenset({3}) ==> frozenset({2}) conf: 0.6666666666666666
frozenset({2}) ==> frozenset({3}) conf: 0.6666666666666666
frozenset({5}) ==> frozenset({3}) conf: 0.6666666666666666
frozenset({3}) ==> frozenset({5}) conf: 0.6666666666666666
frozenset({5}) ==> frozenset({2}) conf: 1.0
frozenset({2}) ==> frozenset({5}) conf: 1.0
frozenset({3}) ==> frozenset({1}) conf: 0.6666666666666666
frozenset({1}) ==> frozenset({3}) conf: 1.0
frozenset({5}) ==> frozenset({2, 3}) conf: 0.6666666666666666
frozenset({3}) ==> frozenset({2, 5}) conf: 0.6666666666666666
frozenset({2}) ==> frozenset({3, 5}) conf: 0.6666666666666666


## 发现毒蘑菇的相似特征

In [13]:
mushDatSet = [line.split() for line in open('data/mushroom.dat').readlines()]
L, suppData = apriori(mushDatSet, minSupport=0.3)
for item in L[1]:
    if item.intersection('2'):
        print(item)

frozenset({'28', '2'})
frozenset({'53', '2'})
frozenset({'23', '2'})
frozenset({'34', '2'})
frozenset({'2', '36'})
frozenset({'59', '2'})
frozenset({'63', '2'})
frozenset({'67', '2'})
frozenset({'76', '2'})
frozenset({'85', '2'})
frozenset({'86', '2'})
frozenset({'2', '90'})
frozenset({'93', '2'})
frozenset({'39', '2'})


In [14]:
for item in L[3]:
    if item.intersection('2'):
        print(item)

frozenset({'28', '59', '34', '2'})
frozenset({'28', '85', '34', '2'})
frozenset({'28', '90', '34', '2'})
frozenset({'28', '59', '85', '2'})
frozenset({'28', '59', '90', '2'})
frozenset({'28', '63', '34', '2'})
frozenset({'28', '59', '63', '2'})
frozenset({'28', '85', '63', '2'})
frozenset({'28', '86', '63', '2'})
frozenset({'28', '39', '63', '2'})
frozenset({'28', '86', '34', '2'})
frozenset({'28', '86', '59', '2'})
frozenset({'28', '86', '85', '2'})
frozenset({'28', '86', '90', '2'})
frozenset({'28', '90', '85', '2'})
frozenset({'28', '39', '34', '2'})
frozenset({'28', '39', '59', '2'})
frozenset({'28', '39', '85', '2'})
frozenset({'28', '39', '86', '2'})
frozenset({'28', '39', '90', '2'})
frozenset({'53', '90', '34', '2'})
frozenset({'53', '85', '34', '2'})
frozenset({'53', '86', '85', '2'})
frozenset({'53', '90', '85', '2'})
frozenset({'53', '39', '85', '2'})
frozenset({'53', '28', '85', '2'})
frozenset({'53', '86', '34', '2'})
frozenset({'53', '86', '90', '2'})
frozenset({'53', '86