In [None]:
import pandas as pd

# Load the groceries CSV into a DataFrame and preview the first rows.
data = pd.read_csv('Groceries_dataset.csv')
display(data.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" below it.
data.info()

In [None]:
# Columns that identify one basket: everything a member bought on one date.
basket_keys = ['Member_number', 'Date']

# Parse the day-first date strings, then order the rows by member and date.
data = data.assign(Date=pd.to_datetime(data['Date'], dayfirst=True)).sort_values(by=basket_keys)

# Collapse each (member, date) group into the set of its item descriptions,
# one row per basket, ordered by member and date with a clean 0..n-1 index.
transactions = (
    data.groupby(basket_keys)['itemDescription']
    .apply(set)
    .reset_index(name='Transaction')
    .sort_values(by=basket_keys)
    .reset_index(drop=True)
)

transactions.head()

In [None]:
# Decorator that measures and reports a function's running time
import functools
import inspect
import time


def timer(func):
    """Wrap ``func`` so that every call is timed and reported.

    The wrapper prints the function name, the ``min_support`` argument of the
    call (``n/a`` when the call has no such argument) and the elapsed time.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that calls ``func`` with the same arguments and returns the
        tuple ``(result, elapsed_seconds)``.
    """
    try:
        signature = inspect.signature(func)
    except (TypeError, ValueError):
        # Some callables expose no introspectable signature.
        signature = None

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is the high-resolution clock intended for measuring
        # short durations.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time

        # Take the threshold from this call's own arguments (keyword or
        # positional) instead of relying on a global named ``min_support``.
        min_support = kwargs.get('min_support')
        if min_support is None and signature is not None:
            try:
                bound = signature.bind(*args, **kwargs)
            except TypeError:
                bound = None
            if bound is not None:
                bound.apply_defaults()
                min_support = bound.arguments.get('min_support')

        try:
            threshold = f"{min_support:.2f}"
        except (TypeError, ValueError):
            # No numeric threshold was passed to this call.
            threshold = "n/a"

        print(f"{func.__name__}: Support Threshold = {threshold}, Time = {elapsed:.4f} seconds")
        return result, elapsed
    return wrapper

@timer
def Apriori(df, min_support):
    """Mine frequent itemsets from ``df`` with mlxtend's ``apriori``.

    ``use_colnames=True`` is passed so the itemsets are reported with the
    DataFrame's column names. The name ``apriori`` must already be imported
    from ``mlxtend.frequent_patterns`` when this function is called.

    Because of the ``timer`` decorator, a call returns the tuple
    ``(frequent_itemsets, elapsed_seconds)`` rather than the itemsets alone.
    """
    return apriori(df, min_support=min_support, use_colnames=True)

# TODO: implement runner(func, dfs, min_supports, *args, **kwargs).
    

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth
import time

# Use the first 100 transactions as the benchmark sample
sample_transactions = transactions['Transaction'].iloc[:100].tolist()

# Encode the transactions as a boolean item matrix
te = TransactionEncoder()
te_ary = te.fit(sample_transactions).transform(sample_transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Support thresholds to benchmark, with one runtime list per algorithm
min_supports = [0.01, 0.02, 0.03, 0.04, 0.05, 0.1]
apriori_time = []
fpgrowth_time = []

# time.perf_counter() is used instead of time.time(): it is the
# high-resolution clock meant for measuring short durations, whereas
# time.time() can be too coarse for runs that take only a few milliseconds.
for min_support in min_supports:
    # Apriori
    start_time = time.perf_counter()
    frequent_itemsets_apriori = apriori(df, min_support=min_support, use_colnames=True)
    t = time.perf_counter() - start_time
    apriori_time.append(t)
    print(f'Apriori: Support Threshold = {min_support:.2f}, Time = {t} seconds')
for min_support in min_supports:
    # FP-growth
    start_time = time.perf_counter()
    frequent_itemsets_fpgrowth = fpgrowth(df, min_support=min_support, use_colnames=True)
    t = time.perf_counter() - start_time
    fpgrowth_time.append(t)
    print(f'FP-growth: Support Threshold = {min_support:.2f}, Time = {t} seconds')


In [None]:
import matplotlib.pyplot as plt

# One line per algorithm: runtime as a function of the support threshold.
runtime_series = (
    (apriori_time, 'b', 'Apriori'),
    (fpgrowth_time, 'r', 'FP-growth'),
)
for runtimes, line_color, series_label in runtime_series:
    plt.plot(min_supports, runtimes, marker='o', linestyle='-', color=line_color, label=series_label)

plt.title("Runtime Comparison")
plt.xlabel("Support Threshold")
plt.ylabel("Runtime (seconds)")

plt.legend()
plt.show()


In [None]:
# Transaction counts to benchmark
transaction_sizes = [100, 500, 1000]
# Support threshold for this experiment, stated explicitly so the cell does not
# depend on a loop variable left behind by another cell.
size_benchmark_min_support = 0.1
results = []

for size in transaction_sizes:
    sample_transactions = transactions['Transaction'].iloc[:size].tolist()
    # A fresh encoder per sample keeps this loop from refitting an encoder
    # that belongs to another cell.
    size_encoder = TransactionEncoder()
    te_ary = size_encoder.fit(sample_transactions).transform(sample_transactions)
    df = pd.DataFrame(te_ary, columns=size_encoder.columns_)

    # Apriori; perf_counter() is the high-resolution clock for short durations
    start_time = time.perf_counter()
    apriori(df, min_support=size_benchmark_min_support, use_colnames=True)
    apriori_elapsed = time.perf_counter() - start_time

    # FP-growth
    start_time = time.perf_counter()
    fpgrowth(df, min_support=size_benchmark_min_support, use_colnames=True)
    fpgrowth_elapsed = time.perf_counter() - start_time

    # The per-run timings use their own names so the runtime lists of the
    # support-threshold benchmark are not overwritten with single floats.
    results.append((size, apriori_elapsed, fpgrowth_elapsed))

# Print the comparison, one line per sample size
for size, apriori_elapsed, fpgrowth_elapsed in results:
    print(f"事务数: {size}, Apriori时间: {apriori_elapsed:.4f}秒, FP-growth时间: {fpgrowth_elapsed:.4f}秒")


In [None]:
import matplotlib.pyplot as plt

# Split the (size, Apriori seconds, FP-growth seconds) records into one series
# per column. Separate names are used so the lists built by the benchmark
# cells are not replaced by these tuples.
sizes_axis, apriori_by_size, fpgrowth_by_size = zip(*results)

plt.plot(sizes_axis, apriori_by_size, marker='o', linestyle='-', color='b', label='Apriori')
plt.plot(sizes_axis, fpgrowth_by_size, marker='o', linestyle='-', color='r', label='FP-growth')

plt.title("Runtime Comparison")
# The x-axis shows how many transactions were mined, not a support threshold.
plt.xlabel("Number of Transactions")
plt.ylabel("Runtime (seconds)")

plt.legend()
plt.show()


In [None]:
from mlxtend.frequent_patterns import association_rules

# Confidence threshold for keeping a rule
min_confidence = 0.5
sample_transactions = transactions['Transaction'].iloc[:100].tolist()

te = TransactionEncoder()
te_ary = te.fit(sample_transactions).transform(sample_transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)

# Generate the strong association rules. num_itemsets is the number of
# transactions the itemsets were mined from, so it is taken from the encoded
# frame instead of being hard-coded.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence, num_itemsets=len(df))

# Evaluation measures for a rule A -> B. 'lift' is already provided by
# association_rules; the others are derived from the two conditional
# probabilities P(B|A) = sup(A,B)/sup(A) and P(A|B) = sup(A,B)/sup(B).
forward_confidence = rules['confidence']                            # P(B|A)
reverse_confidence = rules['support'] / rules['consequent support']  # P(A|B)
both_confidences = pd.concat([forward_confidence, reverse_confidence], axis=1)

# all_confidence = sup(A,B) / max(sup(A), sup(B)) = min(P(B|A), P(A|B))
rules['all_confidence'] = both_confidences.min(axis=1)
# max_confidence = max(P(B|A), P(A|B))
rules['max_confidence'] = both_confidences.max(axis=1)
# Kulczynski = (P(B|A) + P(A|B)) / 2; the column keeps its existing name 'kluc'
rules['kluc'] = both_confidences.mean(axis=1)
# cosine = sup(A,B) / sqrt(sup(A) * sup(B))
rules['cosine'] = rules['support'] / (rules['antecedent support'] * rules['consequent support'])**0.5

# Show the rules together with their evaluation measures
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'all_confidence', 'max_confidence', 'kluc', 'cosine']]
