In [49]:
import pandas as pd

from pathlib import Path

# Directory holding the raw spreadsheet. Resolved from the current user's home
# directory instead of a hard-coded '/Users/<name>' prefix, so the cell also
# runs under a different account; change this one constant to relocate the data.
DATA_DIR = Path.home() / 'Desktop'

# Load the raw data
data = pd.read_excel(DATA_DIR / 'data.xls')

# Columns to keep: the six syndrome-type coefficient columns plus 'TNM分期'
selected_columns = [
    '肝气郁结证型系数',
    '热毒蕴结证型系数',
    '冲任失调证型系数',
    '气血两虚证型系数',
    '脾胃虚弱证型系数',
    '肝肾阴虚证型系数',
    'TNM分期'
]

# Keep only the selected columns (raises KeyError if any of them is missing)
reduced_data = data[selected_columns]

# Save the reduced table to a new Excel file for the next step
reduced_data.to_excel('reduced_data.xlsx', index=False)

In [50]:
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

# Load the reduced table written by the previous step
data = pd.read_excel('reduced_data.xlsx')

# One letter prefix per coefficient column, in column order
column_labels = ['A', 'B', 'C', 'D', 'E', 'F']

# Discretize every column except the last one ('TNM分期'), one column at a time
for idx, column in enumerate(data.columns[:-1]):
    # Equal-width binning into 4 ordinal bins
    discretizer = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')

    # 0-based bin index of every row, taken from the fitted discretizer itself.
    # The labels must come from these codes (not from a separate pd.cut call,
    # whose right-closed intervals place values lying exactly on an edge in the
    # lower bin) so that they agree with the intervals printed below.
    bin_codes = discretizer.fit_transform(data[[column]]).ravel().astype(int)

    # Edges of the fitted bins; derive the bin count from them rather than from
    # the requested n_bins so labels and edges can never get out of step
    bin_edges = discretizer.bin_edges_[0]
    n_fitted_bins = len(bin_edges) - 1

    # Replace the numeric column with category labels such as 'A1'..'A4'
    bin_labels = [f'{column_labels[idx]}{i + 1}' for i in range(n_fitted_bins)]
    data[column] = pd.Categorical.from_codes(bin_codes, categories=bin_labels, ordered=True)

    # Report each label with its value interval.
    # NOTE: bins are left-closed; the top bin additionally contains the column
    # maximum even though it is printed with ')' like the others.
    print(f'{column} ({column_labels[idx]}):')
    for i in range(n_fitted_bins):
        lower_bound = bin_edges[i]
        upper_bound = bin_edges[i + 1]
        print(f'{bin_labels[i]}: [{lower_bound}, {upper_bound})')
    print()

# Save the discretized table to a new Excel file for the next step
data.to_excel('discretized_data.xlsx', index=False)

肝气郁结证型系数 (A):
A1: [0.026, 0.1455)
A2: [0.1455, 0.265)
A3: [0.265, 0.3845)
A4: [0.3845, 0.504)

热毒蕴结证型系数 (B):
B1: [0.0, 0.195)
B2: [0.195, 0.39)
B3: [0.39, 0.585)
B4: [0.585, 0.78)

冲任失调证型系数 (C):
C1: [0.067, 0.20274999999999999)
C2: [0.20274999999999999, 0.33849999999999997)
C3: [0.33849999999999997, 0.47424999999999995)
C4: [0.47424999999999995, 0.61)

气血两虚证型系数 (D):
D1: [0.059, 0.18225000000000002)
D2: [0.18225000000000002, 0.3055)
D3: [0.3055, 0.42875)
D4: [0.42875, 0.552)

脾胃虚弱证型系数 (E):
E1: [0.003, 0.13375)
E2: [0.13375, 0.2645)
E3: [0.2645, 0.39525)
E4: [0.39525, 0.526)

肝肾阴虚证型系数 (F):
F1: [0.016, 0.16375)
F2: [0.16375, 0.3115)
F3: [0.3115, 0.45925)
F4: [0.45925, 0.607)



In [51]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Load the discretized table written by the previous step
data = pd.read_excel('discretized_data.xlsx')

# Treat every row as one transaction: the list of that row's cell values
transactions = data.values.tolist()

# One-hot encode the transactions into a boolean item matrix
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
data_bool = pd.DataFrame(te_ary, columns=te.columns_)

# Mine frequent itemsets with a minimum support of 10%
min_support = 0.10
frequent_itemsets = apriori(data_bool, min_support=min_support, use_colnames=True)

# Derive association rules with a minimum confidence of 75%
min_confidence = 0.75
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=min_confidence)


def _has_h_item(itemset):
    """Return True when at least one item of `itemset` starts with 'H'."""
    return any(item.startswith('H') for item in itemset)


# Keep only the rules whose consequent contains an 'H'-prefixed item
# (presumably the TNM staging labels, e.g. 'H4' — confirm against the data)
consequent_mask = rules['consequents'].apply(_has_h_item)
filtered_rules = rules[consequent_mask]

# Write the retained rules to an Excel file
filtered_rules.to_excel('association_rules_tnm.xlsx', index=False)

# Show the retained rules
print(filtered_rules)

     antecedents consequents  antecedent support  consequent support  \
3       (F3, A2)        (H4)            0.219355            0.446237   
8       (F3, B1)        (H4)            0.144086            0.446237   
9       (F3, C2)        (H4)            0.184946            0.446237   
11      (F3, D2)        (H4)            0.204301            0.446237   
12  (F3, D2, A2)        (H4)            0.136559            0.446237   

     support  confidence      lift  leverage  conviction  zhangs_metric  
3   0.167742    0.764706  1.713678  0.069858    2.353495       0.533482  
8   0.117204    0.813433  1.822874  0.052908    2.968172       0.527408  
9   0.145161    0.784884  1.758896  0.062632    2.574252       0.529366  
11  0.160215    0.784211  1.757387  0.069048    2.566221       0.541629  
12  0.103226    0.755906  1.693957  0.042288    2.268644       0.474458  
