In [241]:
import numpy as np
import pandas as pd

In [242]:
# Read one transaction per CSV row and drop the missing (NaN) cells,
# leaving a list of item lists.
data = [
    [cell for cell in row if not pd.isna(cell)]
    for row in np.array(pd.read_csv("goods2.csv", header=None))
]
data

[['good1', 'good3', 'good4'],
 ['good2', 'good3', 'good5'],
 ['good1', 'good2', 'good3', 'good5'],
 ['good2', 'good5']]

In [243]:
# Build the candidate 1-itemsets
def creat_C1(data):
    """Collect every distinct item in ``data`` as a one-element frozenset.

    Args:
        data: Iterable of transactions, each an iterable of hashable items.

    Returns:
        A set containing one single-item ``frozenset`` per distinct item.
    """
    return {frozenset([item]) for transaction in data for item in transaction}

In [244]:
# Preview the candidate 1-itemsets; the helper is defined as `creat_C1`.
creat_C1(data)

{frozenset({'good1'}),
 frozenset({'good5'}),
 frozenset({'good4'}),
 frozenset({'good3'}),
 frozenset({'good2'})}

In [245]:
# Check whether a candidate itemset satisfies the Apriori principle
def is_apriori(Ck_item, Lksub1):
    """Tell whether every "one item removed" subset of ``Ck_item`` is in ``Lksub1``.

    Args:
        Ck_item: Candidate itemset (a ``frozenset``).
        Lksub1: Collection of frozensets to look the subsets up in.

    Returns:
        ``True`` if each subset obtained by removing a single item from
        ``Ck_item`` is a member of ``Lksub1`` (vacuously ``True`` for an
        empty ``Ck_item``), otherwise ``False``.
    """
    return all((Ck_item - frozenset([member])) in Lksub1 for member in Ck_item)

In [246]:
# Generate the candidate k-itemsets that satisfy the Apriori principle from
# the frequent (k-1)-itemsets. Two itemsets are joined only when their sorted
# prefixes are identical.
def creat_Ck(Lksub1, k):
    """Join pairs of (k-1)-itemsets into candidate k-itemsets and prune them.

    Two itemsets are joined when the first ``k - 2`` items of their sorted
    forms are equal. The union is kept only if ``is_apriori`` accepts it,
    i.e. every subset obtained by dropping one item is in ``Lksub1``.

    Args:
        Lksub1: Set of frozensets (the frequent (k-1)-itemsets).
        k: Size of the candidate itemsets to build.

    Returns:
        A set of frozensets: the candidates that passed the pruning check.
    """
    Ck = set()
    # Sort each itemset once up front rather than inside the pair loop.
    ordered = [(itemset, sorted(itemset)) for itemset in Lksub1]
    for i, (set_a, items_a) in enumerate(ordered):
        # Start after i: joining an itemset with itself cannot yield a
        # k-itemset, so self-pairs are skipped.
        for set_b, items_b in ordered[i + 1:]:
            if items_a[:k - 2] != items_b[:k - 2]:
                continue
            Ck_item = set_a | set_b
            # The pruning check applies only to the candidate just built for
            # this pair, never to a leftover from an earlier iteration.
            if is_apriori(Ck_item, Lksub1):
                Ck.add(Ck_item)
    return Ck

In [247]:
# Build the frequent k-itemsets from the candidate k-itemsets,
# discarding candidates whose support is too low.
def generate_Lk_by_Ck(data, Ck, min_support, support_data):
    """Keep the candidates in ``Ck`` whose support reaches ``min_support``.

    Support is the number of transactions in ``data`` that contain the
    candidate, divided by ``len(data)``.

    Args:
        data: Sized collection of transactions (each an iterable of items).
        Ck: Iterable of candidate itemsets (frozensets).
        min_support: Minimum support ratio a candidate must reach to be kept.
        support_data: Dict updated in place with ``itemset -> support`` for
            every candidate that is kept.

    Returns:
        A set with the candidates whose support is >= ``min_support``.
    """
    occurrences = {}
    for transaction in data:
        for candidate in Ck:
            if candidate.issubset(transaction):
                occurrences[candidate] = occurrences.get(candidate, 0) + 1

    total = len(data)
    frequent = set()
    for candidate, hits in occurrences.items():
        support = hits / total
        if support >= min_support:
            frequent.add(candidate)
            support_data[candidate] = support
    return frequent

In [248]:
# a = [1,2,34,4]
# b = a.copy()
# b.append(33)
# a

In [249]:
# Generate the frequent itemsets
def generate_L(data, k, min_support):
    """Compute the frequent itemsets of sizes 1 through ``k``.

    Args:
        data: Collection of transactions (each an iterable of items).
        k: Largest itemset size to generate.
        min_support: Minimum support ratio for an itemset to be kept.

    Returns:
        A tuple ``(L, support_data)``. ``L`` is a list whose entry ``i`` is
        the set of frequent itemsets of size ``i + 1`` (possibly empty).
        ``support_data`` maps every kept itemset to its support.
    """
    support_data = {}
    current = generate_Lk_by_Ck(data, creat_C1(data), min_support, support_data)
    L = [current]
    for size in range(2, k + 1):
        candidates = creat_Ck(current, size)
        current = generate_Lk_by_Ck(data, candidates, min_support, support_data)
        L.append(current)
    return L, support_data

In [250]:
# Generate the association rules
def generate_rules(L, support_data, min_conf):
    """Derive rules ``antecedent --> consequent`` from the frequent itemsets.

    Itemsets are visited level by level. Each one is paired with every
    previously visited itemset that is a subset of it: the subset becomes the
    consequent and the remaining items the antecedent. Confidence is
    ``support(itemset) / support(antecedent)``.

    Args:
        L: List of sets of frequent itemsets (frozensets), one set per size.
        support_data: Dict mapping itemsets to their support; it must hold
            every itemset in ``L`` and every antecedent that gets formed.
        min_conf: Minimum confidence for a rule to be reported.

    Returns:
        A list of ``[antecedent, consequent, confidence]`` lists.
    """
    big_rule_list = []
    seen_sets = []
    for level in L:
        for freq_set in level:
            for sub_set in seen_sets:
                if not sub_set.issubset(freq_set):
                    continue
                antecedent = freq_set - sub_set
                conf = support_data[freq_set] / support_data[antecedent]
                big_rule = [antecedent, sub_set, conf]
                if conf >= min_conf and big_rule not in big_rule_list:
                    big_rule_list.append(big_rule)
            # Only now does freq_set become available as a consequent for
            # itemsets visited later.
            seen_sets.append(freq_set)
    return big_rule_list

In [251]:
if __name__ == "__main__":

    # Load the data set and strip the missing (NaN) cells from each row
    data = np.array(pd.read_csv("goods2.csv", header=None))
    data_set = [[cell for cell in row if not pd.isna(cell)] for row in data]

    # Mine the frequent itemsets
    L, support_data = generate_L(data_set, k=3, min_support=0.5)

    # Derive the association rules
    big_rules_list = generate_rules(L, support_data, min_conf=0.7)

    # Print the frequent itemsets, then the association rules
    for Lk in L:
        for freq_set in Lk:
            print(list(freq_set), 'support:', support_data[freq_set])
    print('\n\n关联规则')
    for antecedent, consequent, conf in big_rules_list:
        print(list(antecedent), "-->", list(consequent), "conf: ", conf)

['good2'] support: 0.75
['good3'] support: 0.75
['good1'] support: 0.5
['good5'] support: 0.75
['good2', 'good5'] support: 0.75
['good3', 'good1'] support: 0.5
['good3', 'good5'] support: 0.5
['good2', 'good3'] support: 0.5
['good2', 'good3', 'good5'] support: 0.5


关联规则
['good5'] --> ['good2'] conf:  1.0
['good2'] --> ['good5'] conf:  1.0
['good1'] --> ['good3'] conf:  1.0
['good3', 'good5'] --> ['good2'] conf:  1.0
['good2', 'good3'] --> ['good5'] conf:  1.0
