# 自定义选择器


In [None]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

from factorflow import SelectFprKS, SelectFromModelShapCV, check_x_shape


In [None]:
# 1. Prepare the data: a synthetic classification problem in a labelled DataFrame.
n_features = 10_000
# One readable label per column: feat_0, feat_1, ..., feat_<n_features - 1>.
column_labels = pd.Index([f"feat_{i}" for i in range(n_features)])
X, y = make_classification(
    n_samples=1000,
    n_features=n_features,
    n_informative=5,
    n_redundant=100,
    random_state=42,  # fixed seed so the generated data is reproducible
)
# Wrap the generated feature matrix so later steps can select columns by label.
X = pd.DataFrame(X, columns=column_labels)

In [None]:
# 2. 自定义选择器
class MySelector(BaseEstimator, TransformerMixin):
    """Minimal custom selector: keep the columns whose column-wise sum is positive.

    After ``fit`` the instance exposes:

    * ``feature_importances_`` -- the per-column sum of the training frame.
    * ``mask_`` -- boolean flags, ``True`` where that sum is greater than 0.
    """

    def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> "MySelector":
        """Compute per-column sums of ``X`` and derive the keep-mask from them.

        ``y`` is accepted for pipeline compatibility but is not used here.
        Returns ``self`` so calls can be chained.
        """
        column_totals = X.sum(axis=0)
        self.feature_importances_ = column_totals
        # A column is kept only when its total is strictly positive.
        self.mask_ = column_totals > 0
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return only the columns of ``X`` flagged in ``mask_`` by ``fit``."""
        keep = self.mask_
        return X.loc[:, keep]


In [None]:
# 3. Build the pipeline
estimator = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=42,
)

# Steps are listed in execution order and unpacked into make_pipeline below.
# NOTE(review): the behaviour of the factorflow helpers (check_x_shape,
# SelectFprKS, SelectFromModelShapCV, .check_selection()) is not visible in
# this file -- confirm against the factorflow documentation.
pipeline_steps = [
    # Extra utility used to print the shape of X at this point.
    check_x_shape("before my selector"),
    MySelector(),
    check_x_shape("after my selector"),
    SelectFprKS(0.05).check_selection(),
    SelectFromModelShapCV(
        estimator=estimator,
        task_type="classification",
        n_features_to_select=50,
    ).check_selection(),
]
fe_pipeline = make_pipeline(*pipeline_steps)

In [None]:
# 4. Fit and transform: call fit_transform on the pipeline with X and y.
# NOTE(review): as the final bare expression of the cell, the returned value is
# presumably what the notebook displays -- keep it unassigned.
fe_pipeline.fit_transform(X, y)