In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# 1. Load the rule table from sheet "Sheet1" of the Excel workbook.
df = pd.read_excel(io='tree.xlsx', sheet_name='Sheet1')

# 2. Text preprocessing: keep only feature names of the form "ind_xxx".
# For example, "ind_4e < 1 or missing" becomes "ind_4e".
def preprocess_rule(text: object) -> str:
    """Return the ``ind``-prefixed feature names found in a rule, space-joined.

    Args:
        text: The rule description. Any value is accepted; it is converted
            with ``str()`` before matching, so non-string cells (e.g. NaN or
            None) simply produce no matches.

    Returns:
        The matched feature names in order of appearance, separated by single
        spaces. Repeated names are kept. An empty string is returned when
        nothing matches.
    """
    import re

    # The leading \b anchors the match at the start of a word, so that "ind"
    # occurring inside another identifier (e.g. "window_size") is not
    # mistaken for a feature name. \w already covers the underscore.
    features = re.findall(r"\bind\w+", str(text))
    return " ".join(features)

# 3. Apply the preprocessing: one space-separated feature string per rule.
df['features_text'] = df['DetailedSplit'].apply(preprocess_rule)

# 4. Build the matrix (one-hot style encoding).
# binary=True records only whether a rule uses a feature, not how many times.
# lowercase=False keeps feature names exactly as extracted; the default
# (lowercase=True) would rename columns to lowercase and merge feature names
# that differ only by case into a single column.
vectorizer = CountVectorizer(binary=True, lowercase=False)
X = vectorizer.fit_transform(df['features_text'])

# 5. Convert to a DataFrame for inspection (this is the "rule-feature" matrix).
# Reusing df's index keeps every row labelled like its source rule, so the
# index-aligned column assignment below cannot misalign or introduce NaN.
rule_feature_matrix = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df.index,
)

# Attach the original Point (score) column for side-by-side comparison.
rule_feature_matrix['Score_Points'] = df['Point']

print(rule_feature_matrix.head())