In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# 1. 读取数据
df = pd.read_excel('tree.xlsx', sheet_name='Sheet1')

# 2. Text-preprocessing helper: keep only indicator names of the form "ind_xxx".
# Example: "ind_4e < 1 or missing" becomes "ind_4e".
def preprocess_rule(text):
    """Extract the indicator names mentioned in one rule description.

    Args:
        text: Rule text such as "ind_4e < 1 or missing". Any non-string
            value is converted with ``str()`` before matching.

    Returns:
        The matched indicator names joined by single spaces, in order of
        appearance (repeats are kept); "" when nothing matches.
    """
    import re
    # Anchor on a word boundary and require the literal "ind_" prefix, so that
    # ordinary words that merely contain "ind" (e.g. "window" -> "indow") are
    # not mistaken for indicator names.
    features = re.findall(r"\bind_\w+", str(text))
    return " ".join(features)

# 3. Derive the space-separated indicator string for every rule
df['features_text'] = df['DetailedSplit'].map(preprocess_rule)

# 4. Build the one-hot matrix.
# binary=True records only whether an indicator occurs in a rule, not how often.
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(df['features_text'])

# 5. Dense "rule x indicator" frame, with each rule's Point value attached
#    as Score_Points so the two can be compared side by side
rule_feature_matrix = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
).assign(Score_Points=df['Point'])

print(rule_feature_matrix.head())

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# ==========================================
# Step 0: setup
# ==========================================
# Placeholder for the full list of 17 indicator names -- replace with the real list.
# An indicator that never occurs in any rule still gets an all-zero column in the result.
all_17_indicators = [
    "ind_4e",
    "ind_13b",
    "ind_3a_1",
    "ind_12f",
    "ind_2a_1",
    "ind_13a_1",
    "ind_6",
    "ind_7",
    "ind_8",
    "ind_9",
    "ind_10",
    "ind_11",
    "ind_14",
    "ind_15",
    "ind_16",
    "ind_17",
    "ind_unused_example",  # make sure all 17 indicators are listed here
]

# Load the rule table
df = pd.read_excel('tree.xlsx', sheet_name='Sheet1')

# ==========================================
# Step 1: text preprocessing (the 210 rows are kept as they are)
# ==========================================
# Only indicator names are extracted; rows are never split.
def extract_features(text):
    """Extract the indicator names mentioned in one rule description.

    Args:
        text: Rule text such as "ind_4e < 1 or missing", or a missing value
            (None / NaN).

    Returns:
        The matched indicator names joined by single spaces, in order of
        appearance (repeats are kept); "" for missing input or no match.
    """
    import re
    if pd.isna(text):
        return ""
    # Anchor on a word boundary and require the literal "ind_" prefix, so that
    # ordinary words that merely contain "ind" (e.g. "window" -> "indow") are
    # not mistaken for indicator names.
    feats = re.findall(r"\bind_\w+", str(text))
    return " ".join(feats)

df['feature_text'] = df['DetailedSplit'].apply(extract_features)

# ==========================================
# Step 2: build the matrix (forcing all 17 indicators)
# ==========================================
# Key point: the `vocabulary` argument restricts counting to the listed names
# and keeps a column for names that never occur.
vectorizer = CountVectorizer(binary=True, vocabulary=all_17_indicators)

# Generate the sparse rule x indicator matrix
X = vectorizer.fit_transform(df['feature_text'])

# Dense frame whose columns follow the order of all_17_indicators.
# Reuse df's index: pd.concat(axis=1) aligns on index labels, so a fresh
# 0..n-1 index here would create extra NaN rows whenever df is indexed differently.
matrix_df = pd.DataFrame(X.toarray(), columns=all_17_indicators, index=df.index)

# ==========================================
# Step 3: assemble the result (rule text and Point first)
# ==========================================
# axis=1 concatenates side by side
final_df = pd.concat([
    df[['DetailedSplit', 'Point']],  # rule text first, then its score
    matrix_df                        # followed by the 17 indicator columns
], axis=1)

# The side-by-side concat must never change the number of rows.
assert len(final_df) == len(df), "row count changed while concatenating"

# ==========================================
# Step 4: check and save
# ==========================================
print(f"最终矩阵维度: {final_df.shape}")
# Expected output: (210, 2 + 17) = (210, 19)

# Preview
print(final_df.head())

# Save the result
final_df.to_excel("final_rule_matrix.xlsx", index=False)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# ==========================================
# Step 0: setup
# ==========================================
# The 17 indicator names (make sure the list is complete)
all_17_indicators = [
    "ind_4e",
    "ind_13b",
    "ind_3a_1",
    "ind_12f",
    "ind_2a_1",
    "ind_13a_1",
    "ind_6",
    "ind_7",
    "ind_8",
    "ind_9",
    "ind_10",
    "ind_11",
    "ind_14",
    "ind_15",
    "ind_16",
    "ind_17",
    "ind_unused_example",
]

# Load the data.
# If the first column is an Index column it could be read with index_col=0;
# it is read as an ordinary column here so it can be carried into the output.
df = pd.read_excel('tree.xlsx', sheet_name='Sheet1')

# [Checkpoint 1] Report the row count right after loading.
# If this already shows 421, the Excel file itself is dirty (saved incorrectly earlier).
print(f"原始数据行数: {df.shape[0]}")

# The first column is assumed to be the original index column
# (if it is not, replace df.columns[0] with the real column name).
index_col_name = df.columns[0]
print(f"我们将使用列 '{index_col_name}' 作为原始索引追踪")

# ==========================================
# Step 1: text preprocessing (rows are never split)
# ==========================================
def extract_features(text):
    """Extract the indicator names mentioned in one rule description.

    Args:
        text: Rule text such as "ind_4e < 1 or missing", or a missing value
            (None / NaN).

    Returns:
        The matched indicator names joined by single spaces, in order of
        appearance (repeats are kept); "" for missing input or no match.
    """
    import re
    if pd.isna(text):
        return ""
    # Anchor on a word boundary and require the literal "ind_" prefix, so that
    # ordinary words that merely contain "ind" (e.g. "window" -> "indow") are
    # not mistaken for indicator names.
    feats = re.findall(r"\bind_\w+", str(text))
    return " ".join(feats)

df['feature_text'] = df['DetailedSplit'].apply(extract_features)

# ==========================================
# Step 2: build the matrix
# ==========================================
vectorizer = CountVectorizer(binary=True, vocabulary=all_17_indicators)
X = vectorizer.fit_transform(df['feature_text'])
# Reuse df's index: pd.concat(axis=1) aligns on index labels, so a fresh
# 0..n-1 index here would create extra NaN rows whenever df is indexed differently.
matrix_df = pd.DataFrame(X.toarray(), columns=all_17_indicators, index=df.index)

# ==========================================
# Step 3: assemble the result (carrying the original index column)
# ==========================================
final_df = pd.concat([
    df[[index_col_name, 'DetailedSplit', 'Point']],  # original index column goes first
    matrix_df
], axis=1)

# The side-by-side concat must never change the number of rows; any excess
# over the expected count therefore comes from the source file itself.
assert len(final_df) == len(df), "row count changed while concatenating"

# ==========================================
# Step 4: checks
# ==========================================
print(f"最终矩阵维度: {final_df.shape}")

# More rows than the expected 210: use the index column to show which entries repeat
if final_df.shape[0] > 210:
    print("\n警告：行数异常增加！正在查找重复的 Index...")
    duplicates = final_df[final_df.duplicated(subset=[index_col_name], keep=False)]
    print(duplicates[[index_col_name, 'DetailedSplit']].head(10))
    print("\n如果看到上面的 Index 有重复，说明数据源里这些行被拆分了。")

# Save
final_df.to_excel("final_rule_matrix_with_index.xlsx", index=False)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Load the saved rule/indicator matrix
df = pd.read_excel("final_rule_matrix.xlsx")

# 2. Select the 0/1 indicator part (assumed to start at the third column;
#    the expected layout is DetailedSplit, Point, then the indicator columns)
feature_cols = df.columns[2:]
X = df[feature_cols]

# 3. Draw the heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(X, cbar=False, cmap="Blues", ax=ax)
ax.set_title("Rule-Feature Heatmap (Dark Blue = Feature Used)")
ax.set_xlabel("Indicators")
ax.set_ylabel("Rule ID")
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.pyplot as plt

# 1. Linkage matrix from Jaccard distances (a good fit for binary data);
#    method='average' or 'complete' usually works well.
Z = linkage(X, metric='jaccard', method='average')

# 2. Dendrogram, to help decide how many clusters to cut
fig, ax = plt.subplots(figsize=(12, 6))
dendrogram(Z, ax=ax)
ax.set_title("Hierarchical Clustering Dendrogram")
ax.set_xlabel("Rule Index")
ax.set_ylabel("Distance")
# Guide line: shows how many branches a cut at this height would produce
ax.axhline(y=0.7, c='r', ls='--', lw=2)
plt.show()

# 3. Assign the labels (assuming the dendrogram suggested 8 clusters);
#    with criterion='maxclust', t=8 asks for at most 8 flat clusters.
labels = fcluster(Z, t=8, criterion='maxclust')
df['Cluster_Label'] = labels

print(df['Cluster_Label'].value_counts())

In [None]:
# Per-cluster profile: share of the rules in each cluster that use each indicator
cluster_profile = df.groupby('Cluster_Label')[feature_cols].mean()

# An indicator counts as a "core feature" of a cluster when its usage rate
# within that cluster exceeds 80% (0.8)
for cluster_id, usage in cluster_profile.iterrows():
    core_features = [name for name, rate in usage.items() if rate > 0.8]

    # Mean Point value of the rules assigned to this cluster
    in_cluster = df['Cluster_Label'] == cluster_id
    avg_score = df.loc[in_cluster, 'Point'].mean()

    print(f"=== Cluster {cluster_id} (风险分: {avg_score:.1f}) ===")
    print(f"核心特征: {core_features}")
    print(f"规则数量: {len(df[df['Cluster_Label'] == cluster_id])}")
    print("-" * 30)

In [None]:
# Point values of the individual rules, grouped by cluster label
fig, ax = plt.subplots(figsize=(10, 6))
sns.stripplot(x="Cluster_Label", y="Point", data=df, jitter=0.2, size=5, ax=ax)
ax.set_title("Score Distribution by Cluster")
ax.axhline(0, color='red', linestyle='--')  # reference line at a score of 0
plt.show()