In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# 1. Load the rule table from the 'Sheet1' worksheet of tree.xlsx.
df = pd.read_excel(
    'tree.xlsx',
    sheet_name='Sheet1',
)

# 2. 文本预处理函数：只保留 "ind_xxx" 这样的特征名
# 例如将 "ind_4e < 1 or missing" 转化为 "ind_4e"
def preprocess_rule(text):
    """Reduce a rule's split text to the indicator names it references.

    Example: ``"ind_4e < 1 or missing"`` becomes ``"ind_4e"``.

    Args:
        text: Raw rule text. Any value is accepted; it is converted with
            ``str`` before matching.

    Returns:
        str: The ``ind_``-prefixed names found in ``text``, in order of
        appearance (repeats are kept), joined by single spaces. An empty
        string is returned when no indicator is found.
    """
    import re
    # \b anchors the match at the start of a word, so tokens that merely
    # contain "ind" (e.g. "window_size") are no longer mistaken for indicators.
    features = re.findall(r"\bind_\w+", str(text))
    return " ".join(features)

# 3. Reduce every rule to the indicator names it mentions.
df['features_text'] = df['DetailedSplit'].apply(preprocess_rule)

# 4. Build the rule-by-feature matrix.
# binary=True records only whether a feature is used, not how many times.
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(df['features_text'])

# 5. Dense DataFrame view of the matrix, with the original score (Point)
#    appended as the last column for easy cross-reference.
rule_feature_matrix = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
).assign(Score_Points=df['Point'])

print(rule_feature_matrix.head())

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# ==========================================
# 步骤 0: 准备工作
# ==========================================
# Full list of the 17 indicator names (replace with your real list).
# An indicator that never occurs in any rule still gets an all-zero column
# in the result.
all_17_indicators = [
    "ind_4e",
    "ind_13b",
    "ind_3a_1",
    "ind_12f",
    "ind_2a_1",
    "ind_13a_1",
    "ind_6",
    "ind_7",
    "ind_8",
    "ind_9",
    "ind_10",
    "ind_11",
    "ind_14",
    "ind_15",
    "ind_16",
    "ind_17",
    "ind_unused_example",  # make sure all 17 are listed here
]

# Load the rule table.
df = pd.read_excel(
    'tree.xlsx',
    sheet_name='Sheet1',
)

# ==========================================
# 步骤 1: 文本预处理 (保持 210 行不变)
# ==========================================
# 我们只提取特征名，不拆分行
def extract_features(text):
    """Return the indicator names referenced by one rule as a single string.

    Args:
        text: Raw rule text from the ``DetailedSplit`` column; may be missing
            (``None``/NaN).

    Returns:
        str: The ``ind_``-prefixed names found in ``text``, in order of
        appearance (repeats are kept), joined by single spaces. An empty
        string is returned for missing input or when nothing matches.
    """
    import re
    if pd.isna(text):
        return ""
    # \b anchors the match at the start of a word, so tokens that merely
    # contain "ind" (e.g. "window_size") are no longer mistaken for indicators.
    feats = re.findall(r"\bind_\w+", str(text))
    return " ".join(feats)

df['feature_text'] = df['DetailedSplit'].apply(extract_features)

# ------------------------------------------------------------------
# Step 2: build the matrix over the fixed 17-indicator vocabulary
# ------------------------------------------------------------------
# Passing `vocabulary` restricts the columns to exactly these names:
# anything else is ignored, and names that never occur keep an all-zero column.
vectorizer = CountVectorizer(vocabulary=all_17_indicators, binary=True)
X = vectorizer.fit_transform(df['feature_text'])

# Dense DataFrame whose columns follow the order of the supplied list.
matrix_df = pd.DataFrame(X.toarray(), columns=all_17_indicators)

# ------------------------------------------------------------------
# Step 3: put the rule text and its score in front of the indicator columns
# ------------------------------------------------------------------
final_df = pd.concat(
    [df[['DetailedSplit', 'Point']], matrix_df],
    axis=1,  # side-by-side (column-wise) concatenation
)

# ------------------------------------------------------------------
# Step 4: inspect and save
# ------------------------------------------------------------------
# Expected shape: (210, 2 + 17) = (210, 19)
print(f"最终矩阵维度: {final_df.shape}")

# Quick preview
print(final_df.head())

# Persist the result
final_df.to_excel("final_rule_matrix.xlsx", index=False)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# ==========================================
# 步骤 0: 准备工作
# ==========================================
# The 17 indicator names (make sure the list is complete).
all_17_indicators = [
    "ind_4e",
    "ind_13b",
    "ind_3a_1",
    "ind_12f",
    "ind_2a_1",
    "ind_13a_1",
    "ind_6",
    "ind_7",
    "ind_8",
    "ind_9",
    "ind_10",
    "ind_11",
    "ind_14",
    "ind_15",
    "ind_16",
    "ind_17",
    "ind_unused_example",
]

# Load the data. The first column is read as an ordinary column (not as the
# DataFrame index) so that it can be written back out with the result.
df = pd.read_excel(
    'tree.xlsx',
    sheet_name='Sheet1',
)

# [Checkpoint 1] Row count straight after loading. If this is already 421,
# the Excel file itself is dirty (it was saved incorrectly earlier).
print(f"原始数据行数: {len(df)}")

# The first column is taken as the original rule index; change this if the
# index lives in a different column.
index_col_name = df.columns[0]
print(f"我们将使用列 '{index_col_name}' 作为原始索引追踪")

# ==========================================
# 步骤 1: 文本预处理 (绝对不进行拆分)
# ==========================================
def extract_features(text):
    """Return the indicator names referenced by one rule as a single string.

    Args:
        text: Raw rule text from the ``DetailedSplit`` column; may be missing
            (``None``/NaN).

    Returns:
        str: The ``ind_``-prefixed names found in ``text``, in order of
        appearance (repeats are kept), joined by single spaces. An empty
        string is returned for missing input or when nothing matches.
    """
    import re
    if pd.isna(text):
        return ""
    # \b anchors the match at the start of a word, so tokens that merely
    # contain "ind" (e.g. "window_size") are no longer mistaken for indicators.
    feats = re.findall(r"\bind_\w+", str(text))
    return " ".join(feats)

df['feature_text'] = df['DetailedSplit'].apply(extract_features)

# ------------------------------------------------------------------
# Step 2: build the matrix over the fixed indicator vocabulary
# ------------------------------------------------------------------
vectorizer = CountVectorizer(vocabulary=all_17_indicators, binary=True)
X = vectorizer.fit_transform(df['feature_text'])
matrix_df = pd.DataFrame(X.toarray(), columns=all_17_indicators)

# ------------------------------------------------------------------
# Step 3: assemble the result, original index column first
# ------------------------------------------------------------------
final_df = pd.concat(
    [df[[index_col_name, 'DetailedSplit', 'Point']], matrix_df],
    axis=1,
)

# ------------------------------------------------------------------
# Step 4: sanity check
# ------------------------------------------------------------------
print(f"最终矩阵维度: {final_df.shape}")

# More rows than the 210 expected rules: list the index values that occur
# more than once to see which rows were duplicated.
if len(final_df) > 210:
    print("\n警告：行数异常增加！正在查找重复的 Index...")
    duplicates = final_df[final_df.duplicated(subset=[index_col_name], keep=False)]
    print(duplicates[[index_col_name, 'DetailedSplit']].head(10))
    print("\n如果看到上面的 Index 有重复，说明数据源里这些行被拆分了。")

# Save
final_df.to_excel("final_rule_matrix_with_index.xlsx", index=False)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Load the rule-by-feature matrix written earlier.
df = pd.read_excel("final_rule_matrix.xlsx")

# 2. Keep only the 0/1 indicator columns. The first two columns are assumed
#    to be DetailedSplit and Point, so features start at the third column.
feature_cols = df.columns[2:]
X = df.loc[:, feature_cols]

# 3. Heatmap: one row per rule, one column per indicator.
plt.figure(figsize=(15, 10))
sns.heatmap(data=X, cmap="Blues", cbar=False)
plt.xlabel("Indicators")
plt.ylabel("Rule ID")
plt.title("Rule-Feature Heatmap (Dark Blue = Feature Used)")
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.pyplot as plt

# 1. Linkage matrix using Jaccard distance (suited to binary data).
#    method='average' or 'complete' usually work well.
Z = linkage(X, method='average', metric='jaccard')

# 2. Dendrogram, to help decide how many clusters to cut.
plt.figure(figsize=(12, 6))
dendrogram(Z)
plt.xlabel("Rule Index")
plt.ylabel("Distance")
plt.title("Hierarchical Clustering Dendrogram")
# Guide line: shows how many clusters a cut at this height would give.
plt.axhline(y=0.7, color='r', linestyle='--', linewidth=2)
plt.show()

# 3. Assign labels. criterion='maxclust' with t=8 asks for 8 clusters
#    (a number chosen from the dendrogram).
labels = fcluster(Z, t=8, criterion='maxclust')
df['Cluster_Label'] = labels

print(df['Cluster_Label'].value_counts())

In [None]:
# Per-cluster share of rules that use each indicator.
cluster_profile = df.groupby('Cluster_Label')[feature_cols].mean()

# An indicator used by more than 80% (0.8) of a cluster's rules is treated
# as a "core feature" of that cluster.
for cluster_id, row in cluster_profile.iterrows():
    core_features = row[row > 0.8].index.tolist()

    # Rules belonging to this cluster, and their mean risk score.
    members = df[df['Cluster_Label'] == cluster_id]
    avg_score = members['Point'].mean()

    print(f"=== Cluster {cluster_id} (风险分: {avg_score:.1f}) ===")
    print(f"核心特征: {core_features}")
    print(f"规则数量: {len(members)}")
    print("-" * 30)

In [None]:
# Score of every rule, grouped by cluster, with a reference line at 0 points.
plt.figure(figsize=(10, 6))
sns.stripplot(data=df, x="Cluster_Label", y="Point", jitter=0.2, size=5)
plt.axhline(0, color='red', linestyle='--')  # zero-score line
plt.title("Score Distribution by Cluster")
plt.show()

In [None]:
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster
import numpy as np

# ==========================================
# 1. 读取数据 & 准备
# ==========================================
# Load the matrix file produced in the previous step.
df = pd.read_excel("final_rule_matrix.xlsx")

# Indicator columns are assumed to start at the third column (the first two
# being DetailedSplit and Point). Adjust so that feature_cols holds only the
# 0/1 indicator columns.
feature_cols = df.columns[2:]
X = df.loc[:, feature_cols]

# ------------------------------------------------------------------
# 2. Clustering (same logic as before)
# ------------------------------------------------------------------
# Hierarchical clustering on Jaccard distance.
Z = linkage(X, method='average', metric='jaccard')

# Cut the tree at distance 0.7; t=8 with criterion='maxclust' is an alternative.
threshold = 0.7
labels = fcluster(Z, t=threshold, criterion='distance')

# Put the label in the first column so it is easy to spot.
df.insert(0, 'Cluster_Label', labels)

# Order by cluster id (ascending), then by score (descending), for readability.
df = df.sort_values(by=['Cluster_Label', 'Point'], ascending=[True, False])

# ------------------------------------------------------------------
# 3. Per-cluster profile (summary)
# ------------------------------------------------------------------
summary_list = []
unique_labels = sorted(df['Cluster_Label'].unique())

for label in unique_labels:
    # All rules of this cluster
    sub_df = df.loc[df['Cluster_Label'] == label]

    # The mean of a 0/1 column is its frequency; indicators present in more
    # than 80% of the cluster's rules are its core features.
    feat_freq = sub_df[feature_cols].mean()
    core_feats = feat_freq[feat_freq > 0.8].index.tolist()

    point_col = sub_df['Point']
    summary_list.append(dict(
        Cluster_Label=label,
        Rule_Count=len(sub_df),             # number of rules
        Avg_Point=point_col.mean(),         # mean score
        Min_Point=point_col.min(),          # lowest score
        Max_Point=point_col.max(),          # highest score
        Core_Features=", ".join(core_feats),  # core feature list
    ))

summary_df = pd.DataFrame(summary_list)

# ------------------------------------------------------------------
# 4. Save to Excel (two sheets)
# ------------------------------------------------------------------
output_file = "Fraud_Rules_Cluster_Report.xlsx"

with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    # Sheet 1: per-cluster overview
    summary_df.to_excel(writer, sheet_name='Cluster_Summary', index=False)

    # Sheet 2: every rule row, grouped by cluster
    df.to_excel(writer, sheet_name='Cluster_Details', index=False)

print(f"报告已生成: {output_file}")
print("Sheet 'Cluster_Summary': 包含每个簇的画像统计")
print("Sheet 'Cluster_Details': 包含按簇归类的所有原始规则详情")

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from scipy.cluster.hierarchy import linkage, fcluster

# ==========================================
# 1. 读取原始数据 & 保留 Index
# ==========================================
# Load the workbook. index_col=None keeps the first column as an ordinary
# column instead of turning it into the DataFrame index.
df = pd.read_excel(
    "tree.xlsx",
    sheet_name='Sheet1',
    index_col=None,
)

# The first column is taken to be the original rule index.
index_col_name = df.columns[0]
print(f"检测到原始 Index 列名为: {index_col_name}")

# 为了防止后续处理丢失，我们显式地把它重命名为 'Original_Index' (可选，也可保留原名)
# 这里我们选择保留原名，但在输出时调整位置

# ==========================================
# 2. 准备聚类特征
# ==========================================
def extract_features(text):
    """Return the indicator names referenced by one rule as a single string.

    Args:
        text: Raw rule text from the ``DetailedSplit`` column; may be missing
            (``None``/NaN).

    Returns:
        str: The ``ind_``-prefixed names found in ``text``, in order of
        appearance (repeats are kept), joined by single spaces. An empty
        string is returned for missing input or when nothing matches.
    """
    import re
    if pd.isna(text):
        return ""
    # \b anchors the match at the start of a word. Without it, a token such as
    # "window_size" would yield "indow_size", and because the vectorizer in
    # this cell learns its vocabulary from the data, that would become a
    # spurious feature column.
    feats = re.findall(r"\bind_\w+", str(text))
    return " ".join(feats)

# Indicator names per rule; used only to compute the clustering features.
rule_text = df['DetailedSplit'].apply(extract_features)

# Binary rule-by-indicator matrix (vocabulary learned from the rule text).
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(rule_text)
feature_names = vectorizer.get_feature_names_out()

# ------------------------------------------------------------------
# 3. Clustering (distance threshold 0.7)
# ------------------------------------------------------------------
threshold = 0.7
Z = linkage(X.toarray(), method='average', metric='jaccard')
labels = fcluster(Z, t=threshold, criterion='distance')

# ------------------------------------------------------------------
# 4. Arrange the result table
# ------------------------------------------------------------------
# Replace any existing Cluster_Label column, then put the new one first.
if 'Cluster_Label' in df.columns:
    df = df.drop(columns=['Cluster_Label'])
df.insert(0, 'Cluster_Label', labels)

# Column order: Cluster_Label, original index column, then everything else.
# Both leading columns are removed from the remainder so they are not repeated.
cols = list(df.columns)
cols.remove('Cluster_Label')
cols.remove(index_col_name)
new_order = ['Cluster_Label', index_col_name] + cols
df = df[new_order]

# Cluster id ascending, score descending.
df_sorted = df.sort_values(by=['Cluster_Label', 'Point'], ascending=[True, False])

# ------------------------------------------------------------------
# 5. Per-cluster profile
# ------------------------------------------------------------------
X_df = pd.DataFrame(X.toarray(), columns=feature_names)
X_df['Cluster_Label'] = labels

summary_list = []
unique_labels = sorted(df['Cluster_Label'].unique())

for label in unique_labels:
    sub_df = df.loc[df['Cluster_Label'] == label]

    # Frequency of each indicator among this cluster's rules; those above
    # 80% are reported as core features.
    sub_X = X_df.loc[X_df['Cluster_Label'] == label]
    feat_freq = sub_X.drop(columns=['Cluster_Label']).mean()
    core_feats = feat_freq[feat_freq > 0.8].index.tolist()

    # A few index values from the cluster, to make it quick to locate.
    sample_indices = sub_df[index_col_name].head(5).tolist()

    summary_list.append({
        'Cluster_Label': label,
        'Rule_Count': len(sub_df),
        'Avg_Point': sub_df['Point'].mean(),
        'Core_Features': ", ".join(core_feats),
        'Sample_Indices': str(sample_indices) + "...",
    })

summary_df = pd.DataFrame(summary_list)

# ------------------------------------------------------------------
# 6. Save
# ------------------------------------------------------------------
output_file = "Fraud_Rules_Clustered_With_Index.xlsx"

with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    summary_df.to_excel(writer, sheet_name='Cluster_Summary', index=False)
    df_sorted.to_excel(writer, sheet_name='Cluster_Details', index=False)

print(f"文件已生成: {output_file}")
print(f"Sheet 'Cluster_Details' 的第 1 列是 Cluster ID，第 2 列是原始 {index_col_name}。")

# Role
You are a Senior Fraud Risk Expert with 20 years of experience in financial institutions. You specialize in interpreting machine learning model rules (specifically XGBoost) to uncover underlying fraud patterns and business anomalies.

# Task
I will provide 210 fraud detection rules extracted from an XGBoost model. Each rule consists of multiple logical conditions based on specific Indicators and an associated risk score (Points). 
Your task is to perform **Semantic Clustering** on these 210 rules based on their **actual business meanings**.

# Context & Data Dictionary
Before analyzing, study the following data dictionary carefully to understand what each indicator represents:

[Paste the indicator data dictionary here, one line per indicator: name and business meaning]


# Requirements
1. **Focus on Logic, Not Thresholds**: Do not separate rules just because of minor threshold differences (e.g., < 1 vs < 1.01). Focus on the **core business intent**.
2. **Identify Feature Combinations**: Pay attention to indicators that frequently appear together (e.g., High Inventory + Over-financing).
3. **Cluster into Scenarios**: Group these 210 rules into **8 to 12 distinct Risk Scenarios**.

# Output Format
Please provide the results in the following format for each cluster:

## Cluster X: [Professional Business Name, e.g., Inventory Inflation & Over-financing]
- **Risk Logic**: [A concise summary of the fraud/risk pattern. E.g., The entity maintains excessive inventory while simultaneously over-borrowing, suggesting potential loan fraud or asset overstatement.]
- **Core Indicators**: [The primary 2-3 indicators defining this cluster.]
- **Severity**: [High/Medium/Low based on the average Points.]
- **Included Rule Indices**: [List all Rule Indices belonging to this group, e.g., 201, 208, 209...]

# Data
Below are the 210 rules:
[Paste your CSV data here, including Index, Points, and DetailedSplit columns]

In [None]:
import pandas as pd
import re

def extract_indicators(text):
    """Extract the distinct indicator names (ind_xxx) from DetailedSplit text.

    Args:
        text: Raw rule text; may be missing (``None``/NaN).

    Returns:
        list[str]: The distinct ``ind_``-prefixed names found in ``text``,
        sorted alphabetically. An empty list for missing input or when
        nothing matches.
    """
    if pd.isna(text):
        return []
    # \b anchors the match at the start of a word so tokens that merely
    # contain "ind" are not picked up; \w+ covers letters, digits and "_".
    # sorted() makes the order reproducible: iterating a set of strings
    # depends on the interpreter's hash seed and can change between runs.
    return sorted(set(re.findall(r"\bind_\w+", str(text))))

def generate_rule_tables(rule_lists_dict, master_df, index_col='Index'):
    """
    Build the rule summary table (Table 1) and the rule detail table (Table 2).

    Args:
        rule_lists_dict (dict): { 'List Name': [rule_index_1, rule_index_2, ...] }
        master_df (pd.DataFrame): Table holding every rule. It is not modified;
            the function works on a copy.
        index_col (str): Name of the column in ``master_df`` that holds the
            rule index. Values are compared as strings.

    Returns:
        tuple: (summary_df, details_df)
            summary_df: one row per list that matched at least one rule, with
                the columns List_Name, Key_Indicators (>=90%), All_Indicators,
                All_Trees, All_Rules_Count and All_Rule_Indices.
                All_Rule_Indices echoes the requested indices, including any
                that were not found in ``master_df``.
            details_df: the matched rule rows with List_Name as first column,
                restricted to whichever of index_col / Tree / Points / Point /
                DetailedSplit exist. Empty DataFrame if no list matched.
    """
    from collections import Counter

    summary_data = []
    details_data = []

    # Compare indices as strings, but on a private copy: converting the
    # caller's column in place would silently change its dtype for every
    # later use of the frame.
    master = master_df.copy()
    master[index_col] = master[index_col].astype(str)

    for list_name, indices in rule_lists_dict.items():
        indices = [str(i) for i in indices]

        # 1. Rules of the master table that belong to this list
        subset = master[master[index_col].isin(indices)].copy()

        if subset.empty:
            print(f"Warning: List '{list_name}' 里的 Index 在原始表中都找不到，跳过。")
            continue

        # ------------------------------------------
        # Table 2 (details)
        # ------------------------------------------
        subset_detail = subset.copy()
        subset_detail.insert(0, 'List_Name', list_name)  # list name goes first

        # Keep only the key columns; names absent from the data are skipped.
        keep_cols = ['List_Name', index_col, 'Tree', 'Points', 'Point', 'DetailedSplit']
        existing_cols = [c for c in keep_cols if c in subset_detail.columns]
        details_data.append(subset_detail[existing_cols])

        # ------------------------------------------
        # Table 1 (summary)
        # ------------------------------------------
        # A. Count, per indicator, the rules of this list that mention it.
        counts = Counter()
        for _, row in subset.iterrows():
            counts.update(extract_indicators(row.get('DetailedSplit', '')))

        # B. Frequencies. `subset` is non-empty here, so total_rules >= 1.
        total_rules = len(subset)

        # Key indicators: present in at least 90% of the list's rules.
        # Sorted so the output does not depend on extraction order.
        key_indicators = sorted(
            ind for ind, count in counts.items() if (count / total_rules) >= 0.9
        )

        # All indicators seen in the list (distinct, sorted)
        unique_indicators = sorted(counts)

        # C. All tree ids, when the column exists
        if 'Tree' in subset.columns:
            all_trees = sorted(subset['Tree'].unique().tolist())
        else:
            all_trees = []

        # D. Summary row
        summary_data.append({
            'List_Name': list_name,
            'Key_Indicators (>=90%)': ", ".join(key_indicators),
            'All_Indicators': ", ".join(unique_indicators),
            'All_Trees': ", ".join(map(str, all_trees)),
            'All_Rules_Count': total_rules,
            'All_Rule_Indices': ", ".join(indices)  # may be truncated on screen in Excel if very long
        })

    # ------------------------------------------
    # Combine
    # ------------------------------------------
    summary_df = pd.DataFrame(summary_data)

    if details_data:
        details_df = pd.concat(details_data, ignore_index=True)
    else:
        details_df = pd.DataFrame()

    return summary_df, details_df

# ==========================================
# 使用示例
# ==========================================

# 1. Load the original data (the same tree.xlsx as before).
#    The workbook must contain 'DetailedSplit' and 'Index' (or whatever the
#    index column is called).
master_df = pd.read_excel(
    'tree.xlsx',
    sheet_name='Sheet1',
)

# 2. Define the lists (this is the input).
#    They can be written by hand or taken from the earlier clustering result.
my_rule_lists = {
    "Cluster_1": [1, 3, 6, 10],  # sample data; replace with the real Index values
    "Cluster_2": [2, 4, 5, 7, 8],
    "High_Risk_Group": [201, 208, 209],
}

# 3. Build both tables.
#    index_col must be the name of the index column in the workbook; if that
#    column is called 'Rule_ID', pass index_col='Rule_ID'.
df_summary, df_details = generate_rule_tables(
    my_rule_lists,
    master_df,
    index_col='Index',
)

# 4. Save the result, one sheet per table.
with pd.ExcelWriter("Rule_Analysis_Report.xlsx") as writer:
    df_summary.to_excel(writer, sheet_name='Summary_Table_1', index=False)
    df_details.to_excel(writer, sheet_name='Details_Table_2', index=False)

print("处理完成！结果已保存至 Rule_Analysis_Report.xlsx")
print("Sheet 1: 汇总对比 (Key Indicators, Trees...)")
print("Sheet 2: 规则详情 (List, Point, DetailedSplit...)")

In [None]:
def check_integrity(rule_lists_dict, master_df, index_col='Index'):
    """
    Check that the rule lists partition the master table: no rule index is
    assigned twice, none is left out, and none is unknown to the master table.

    Indices are compared as strings. Findings are printed; ``master_df`` is
    not modified.

    Args:
        rule_lists_dict (dict): { 'List Name': [rule_index_1, rule_index_2, ...] }
        master_df (pd.DataFrame): Table holding every rule.
        index_col (str): Name of the column in ``master_df`` holding the rule index.

    Returns:
        bool: True when there are no duplicates, no missing indices and no
        unknown indices; False otherwise.
    """
    # Imported here because no module-level import of Counter is visible for
    # this cell; without it the duplicate check raises NameError.
    from collections import Counter

    print("\n" + "="*30)
    print("正在进行完整性检查 (MECE Check)...")

    # 1. Every index that was assigned to some list (repeats kept)
    all_assigned = []
    for indices in rule_lists_dict.values():
        all_assigned.extend([str(i) for i in indices])

    # 2. Every index present in the master table
    all_master = set(master_df[index_col].astype(str).tolist())
    assigned_set = set(all_assigned)

    # 3. Duplicates (mutually exclusive)
    counts = Counter(all_assigned)
    duplicates = [idx for idx, count in counts.items() if count > 1]

    # 4. Omissions (collectively exhaustive)
    missing = all_master - assigned_set

    # 5. Indices that do not exist in the master table (hallucination check)
    extra = assigned_set - all_master

    # Report
    passed = True
    if duplicates:
        print(f"❌ 错误：发现 {len(duplicates)} 个重复 Index: {duplicates}")
        passed = False
    if missing:
        print(f"❌ 错误：发现 {len(missing)} 个遗漏 Index (未包含在任何 list 中): {sorted(list(missing))}")
        passed = False
    if extra:
        # Sorted so the report is stable from run to run, like `missing` above.
        print(f"⚠️ 警告：发现 {len(extra)} 个不存在于原始表的 Index: {sorted(extra)}")
        passed = False

    if passed:
        print("✅ 检查通过：所有规则不重复、不遗漏，且全部有效。")
    print("="*30 + "\n")
    return passed

In [None]:
# 1. Load the original data.
master_df = pd.read_excel(
    'tree.xlsx',
    sheet_name='Sheet1',
)

# 2. Define the lists (the grouping returned by the LLM).
my_rule_lists = {
    "Cluster_1": [1, 3, 6, 10],
    "Cluster_2": [2, 4, 5, 7, 8],
    "High_Risk_Group": [201, 208, 209],
}

# 3. Run the integrity check first.
#    A failed check does not stop the run: the tables are still generated,
#    but the problems are printed.
check_integrity(my_rule_lists, master_df, index_col='Index')

# 4. Build both tables.
df_summary, df_details = generate_rule_tables(
    my_rule_lists,
    master_df,
    index_col='Index',
)

# 5. Save the result, one sheet per table.
with pd.ExcelWriter("Rule_Analysis_Report.xlsx") as writer:
    df_summary.to_excel(writer, sheet_name='Summary_Table_1', index=False)
    df_details.to_excel(writer, sheet_name='Details_Table_2', index=False)

print("处理完成！")

In [None]:
def get_single_cluster_prompt_data(cluster_label, summary_df, details_df, index_col='Index'):
    """
    Generate prompt data for a SINGLE cluster, including ALL its rules.

    Args:
        cluster_label: Value of ``List_Name`` identifying the cluster.
        summary_df (pd.DataFrame): Summary table with the columns List_Name,
            'Key_Indicators (>=90%)', All_Indicators and All_Rules_Count.
            An optional Average_Point column is used as-is when present.
        details_df (pd.DataFrame): Detail table with List_Name, DetailedSplit
            and, optionally, Point/Points and the index column.
        index_col (str): Name of the rule index column in ``details_df``.

    Returns:
        str: Text block with the cluster statistics followed by one entry per
        rule (index, score, condition).

    Raises:
        IndexError: If ``cluster_label`` does not occur in
            ``summary_df['List_Name']``.
    """
    import pandas as pd

    # 1. Get Summary Info
    matches = summary_df[summary_df['List_Name'] == cluster_label]
    if matches.empty:
        # Same exception type as an out-of-bounds .iloc[0], but with a
        # message that names the actual problem.
        raise IndexError(f"Cluster '{cluster_label}' not found in summary_df['List_Name']")
    summary_row = matches.iloc[0]
    key_inds = summary_row['Key_Indicators (>=90%)']
    all_inds = summary_row['All_Indicators']
    count = summary_row['All_Rules_Count']

    # 2. Get ALL Rules Details for this specific cluster
    subset = details_df[details_df['List_Name'] == cluster_label]

    # The summary table does not necessarily carry an average score; when the
    # column is absent, derive it from the cluster's own rules instead of
    # always reporting 'N/A'. 'Point' is preferred over 'Points', matching the
    # per-rule lookup below.
    if 'Average_Point' in summary_row.index:
        avg_score = summary_row['Average_Point']
    else:
        avg_score = 'N/A'
        for point_col in ('Point', 'Points'):
            if point_col in subset.columns:
                scores = pd.to_numeric(subset[point_col], errors='coerce')
                if scores.notna().any():
                    avg_score = f"{scores.mean():.2f}"
                    break

    # Format the data string
    parts = [
        f"=== TARGET CLUSTER: {cluster_label} ===\n",
        "Statistics:\n",
        f"- Rule Count: {count}\n",
        f"- Average Risk Score: {avg_score}\n",
        f"- Key Indicators (freq>=90%): {key_inds}\n",
        f"- All Indicators Involved: {all_inds}\n\n",
        f"=== ALL INCLUDED RULES ({count} rules) ===\n",
    ]

    # One entry per rule in this cluster
    for _, row in subset.iterrows():
        # Clean up the DetailedSplit text (drop double quotes and outer spaces)
        rule_content = str(row['DetailedSplit']).replace('"', '').strip()
        point = row.get('Point', row.get('Points', 'N/A'))
        rule_idx = row.get(index_col, 'N/A')

        parts.append(f"[Rule Index: {rule_idx} | Score: {point}]\n")
        parts.append(f"Condition: {rule_content}\n")
        parts.append("-" * 20 + "\n")

    return "".join(parts)

# ==========================================
# 使用示例
# ==========================================
# Example: build the prompt data for Cluster 1.
# The name must match a List_Name value in the summary table.
target_cluster = "Cluster_1"
cluster_data_text = get_single_cluster_prompt_data(
    cluster_label=target_cluster,
    summary_df=df_summary,
    details_df=df_details,
)

# Copy the printed text into the [DATA_SECTION] part of the prompt below.
print(cluster_data_text)

In [None]:
# Role
You are a Senior Fraud Risk Expert with 20 years of experience in financial crime investigation and model interpretation. You specialize in translating technical XGBoost rules into clear, actionable business risk scenarios.

# Task
I have performed Hierarchical Clustering on a set of fraud detection rules.
I am providing you with **ALL the rules** belonging to a single specific cluster: **[CLUSTER_NAME]**.

Your goal is to analyze these rules collectively to define the **Risk Persona** of this cluster. You need to explain *what* specific fraud scenario this group of rules is catching.

# Context: Indicator Definitions
Use this dictionary to interpret the business meaning of the rules:

[Paste the indicator dictionary here]

# Analysis Requirements
Please analyze the provided data and output a report covering:

1. **Scenario Name**: A professional, concise name for this risk cluster (e.g., "xxxxx").
2. **Risk Narrative**: A detailed explanation of the fraud pattern.
    - Connect the dots between the **Key Indicators**.
    - Explain the business logic: "Why do these specific indicators appear together?"
    - Example: "This cluster targets companies in the xxxxx"
3. **Core Pattern**: The primary formula of this cluster (e.g., "Pre-condition A + Over-leverage B + Asset Inflation C").
4. **Consistency Check**: Are there any rules in this list that seem to define a slightly different logic? If yes, briefly mention them as "Variants".

# Input Data
[DATA_SECTION_START]
(Paste the Python output here)
[DATA_SECTION_END]

# Output Format
Please provide the response in a structured Markdown format.

In [None]:
# Role
You are a Senior Fraud Risk Analyst. Your task is to interpret a specific fraud detection rule generated by an XGBoost model and translate it into a **single, concise business description**.

# Context: Indicator Dictionary
Use the following definitions to interpret the conditions:

[Paste the indicator dictionary here]

# Task Requirements
1. **Analyze**: Look at the combination of indicators in the provided rule.
2. **Synthesize**: Write **ONE sentence** describing what a company triggering this rule looks like from a business perspective.

# Output Format
Return **ONLY** the description sentence. Do not include "Here is the description" or quotes.

# Input Rule
[RULE_INFO]

In [None]:
import pandas as pd
import time

# ==========================================
# 1. Prompt template
# ==========================================
# Filled in per rule with str.format(); the placeholders are {index},
# {point} and {detailed_split}. Because str.format() is used, any literal
# brace added to this text must be doubled ({{ or }}).
PROMPT_TEMPLATE = """
# Role
You are a Senior Fraud Risk Analyst. Your task is to interpret a specific fraud detection rule generated by an XGBoost model and translate it into a **single, concise business description**.

# Context: Indicator Dictionary
(Use the table provided above in your actual prompt string...)

# Task Requirements
1. Write **ONE sentence** describing the business logic of this rule.
2. Focus on the *combination* of risks (e.g., "").
3. Mention the specific risk type (e.g., "").

# Output Format
Return ONLY the description sentence.

# Input Rule
Index: {index}
Score: {point}
Conditions: {detailed_split}
"""

# ==========================================
# 2. 模拟 LLM 调用函数 (请替换为真实 API)
# ==========================================
def call_llm_api(prompt_text):
    """
    Stand-in for a real LLM call.

    The prompt is ignored and a fixed sentence is returned. For real use,
    replace the body with a call to your LLM provider (or an internal LLM
    endpoint) and return the text of its reply.

    Args:
        prompt_text: The fully formatted prompt (unused by this stub).

    Returns:
        str: A fixed, simulated description.
    """
    simulated_reply = "This is a simulated description from LLM."
    return simulated_reply

# ==========================================
# 3. 主流程
# ==========================================
def process_rules_one_by_one(input_file, output_file):
    """
    Build one prompt per rule and write the results to an Excel file.

    Reads sheet 'Sheet1' of ``input_file``, formats PROMPT_TEMPLATE for every
    row, and saves one output row per rule with the rule index, score,
    cleaned split text, description and the full prompt. As written, no LLM
    is called: the description is the placeholder "PENDING_LLM_RESPONSE".

    Args:
        input_file: Path of the Excel workbook holding the rules.
        output_file: Path of the Excel file to write.
    """
    rules = pd.read_excel(input_file, sheet_name='Sheet1')

    print(f"开始处理 {len(rules)} 条规则...")

    records = []
    for position, rule in rules.iterrows():
        # Fall back to the row label when there is no 'Index' column.
        rule_idx = rule.get('Index', position)
        point = rule.get('Points', rule.get('Point', 'N/A'))
        split = str(rule['DetailedSplit']).replace('"', '').strip()

        # 1. Build the prompt. Only the per-rule variables are substituted
        #    here; the full indicator dictionary has to be placed in the
        #    template itself.
        current_prompt = PROMPT_TEMPLATE.format(
            index=rule_idx,
            point=point,
            detailed_split=split,
        )

        # 2. LLM call. Failures are caught per rule so that one bad rule does
        #    not abort the whole run.
        try:
            # With an API available, use the next line instead of the placeholder:
            # description = call_llm_api(current_prompt)

            # Manual workflow: only the prompt is generated.
            description = "PENDING_LLM_RESPONSE"
            # print(f"Processing Rule {rule_idx}...")

        except Exception as e:
            print(f"Error processing Rule {rule_idx}: {e}")
            description = "ERROR"

        # 3. Collect the result row
        records.append({
            'Rule_Index': rule_idx,
            'Points': point,
            'DetailedSplit': split,
            'LLM_Description': description,  # one-sentence description from the LLM
            'Full_Prompt': current_prompt,   # kept for debugging
        })

        # Optional delay against API rate limits:
        # time.sleep(0.5)

    # 4. Save
    pd.DataFrame(records).to_excel(output_file, index=False)
    print(f"处理完成！结果已保存至 {output_file}")

# ==========================================
# 运行
# ==========================================
# 假设你的文件叫 tree.xlsx
# process_rules_one_by_one('tree.xlsx', 'Rule_Descriptions.xlsx')