# 使用 Pipeline 搭建复杂流程, 使用流式 API


In [None]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

from factorflow import SelectCollinearity, SelectFprKS, SelectFromModelShapCV


In [None]:
# 1. Prepare the data: a synthetic classification set with many more columns
#    (10,000) than rows (1,000); 5 features are informative, 100 are redundant.
n_features = 10_000
column_names = [f"feat_{idx}" for idx in range(n_features)]
X, y = make_classification(
    n_samples=1000,
    n_features=n_features,
    n_informative=5,
    n_redundant=100,
    random_state=42,  # fixed seed so the generated data is reproducible
)
# Wrap the raw array in a DataFrame so every column carries a "feat_<i>" name
# that the selectors can refer to.
X = pd.DataFrame(X, columns=pd.Index(column_names))

In [None]:
# 3. Build the Pipeline
# Shallow random forest used as the model behind the SHAP-based selector.
estimator = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=42,
)

# Step 1: keep the features whose KS test against y has a significance level
# below 0.05, and protect feat_1 from being filtered out.
ks_step = SelectFprKS(0.05).check_selection().protect_features("feat_1")

# Step 2: remove collinear features whose correlation is above 0.9, and check
# whether any feat_1* feature was filtered out.
collinearity_step = (
    SelectCollinearity(threshold=0.9)
    .check_selection()
    .check_features("feat_1*")
)

# Step 3: finally rank features by SHAP-value importance and select the
# 50 most important ones.
shap_step = SelectFromModelShapCV(
    estimator=estimator,
    task_type="classification",
    n_features_to_select=50,
).check_selection()

# Chain the three selectors in the order they were built above.
fe_pipeline = make_pipeline(ks_step, collinearity_step, shap_step)

In [None]:
# 4. Fit and transform: fit every step of the pipeline on (X, y) and return X
#    after it has passed through all of them.
# NOTE(review): the result is left as a bare expression — presumably so the
# notebook shows it as the cell output; assign it if it is needed later.
fe_pipeline.fit_transform(X, y)