像 Keras 一样使用Scikit-Learn：图式构造复杂 Scikit-Learn pipeline 的函数式API
A graph-based functional API for building complex scikit-learn pipelines.
https://github.com/alegonz/baikal

In [29]:
import sklearn.svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from baikal import make_step, Input, Model
from baikal.sklearn import SKLearnWrapper

In [2]:
# 1. Define a step
SVC = make_step(sklearn.svm.SVC)

In [17]:
# 2. Build the model
x = Input()
y_t = Input()
y = SVC(C=1.0, kernel="rbf", gamma=0.5)(x, y_t)
model = Model(x, y, y_t)

In [104]:
# 3. Train the model
dataset = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, random_state=0
)
model.fit(X_train, y_train)

Model(name='Model_1', function='predict', n_outputs=1, trainable=True)

In [68]:
# 4. Use the model
y_test_pred = model.predict(X_test)
y_test_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [20]:
accuracy_score(y_test, y_test_pred)

0.6293706293706294

In [9]:
# Plotting your model
from baikal.plot import plot_model
plot_model(model, filename="baikal_model.png")

In [88]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [108]:
iris = sklearn.datasets.load_iris()
# x_data = iris.data
# y_data = iris.target
random_state = 123
verbose = 0

In [109]:
# cv will default to KFold if the estimator is a baikal Model
# so we have to pass StratifiedKFold directly
cv = StratifiedKFold(n_splits=3, random_state=random_state)

In [110]:
# Define the steps
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

BaiLogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
BaiPCA = make_step(sklearn.decomposition.PCA)
BaiRandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)

In [111]:
# 1. Define a function that returns your baikal model
def build_fn():
    x = Input()
    y_t = Input()
    h = BaiPCA(random_state=random_state,name="pca")(x)
    y = BaiLogisticRegression(random_state=random_state,name="classifier")(h, y_t)
    model = Model(x, y, y_t)
    return model

In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# 2. Define a parameter grid
# - keys have the [step-name]__[parameter-name] format, similar to sklearn Pipelines
# - You can also search over the steps themselves using [step-name] keys
param_grid = [
    {
        "classifier": [
            BaiLogisticRegression(
                random_state=random_state, solver="lbfgs", multi_class="multinomial"
            )
        ],
        "classifier__C": [0.01, 0.1, 1],
        "pca__n_components": [1, 2, 3, 4],
    },
    {
        "classifier": [BaiRandomForestClassifier(random_state=random_state)],
        "classifier__n_estimators": [10, 50, 100],
        "pca__n_components": [1, 2, 3, 4],
    },
]

In [113]:
# 3. Instantiate the wrapper
sk_model = SKLearnWrapper(build_fn)

In [114]:
# 4. Use GridSearchCV as usual
gscv_baikal = GridSearchCV(
    sk_model,
    param_grid,
    cv=cv,
    scoring="accuracy",
    return_train_score=True,
    verbose=verbose,
)
gscv_baikal.fit(X_train, y_train)
best_model = gscv_baikal.best_estimator_.model

In [118]:
best_model.get_params()

{'classifier': LogisticRegression(name='classifier', function='predict', n_outputs=1, trainable=True,
 C=0.01, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='multinomial', n_jobs=None, penalty='l2', random_state=123, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False),
 'classifier__C': 0.01,
 'classifier__class_weight': None,
 'classifier__dual': False,
 'classifier__fit_intercept': True,
 'classifier__intercept_scaling': 1,
 'classifier__l1_ratio': None,
 'classifier__max_iter': 100,
 'classifier__multi_class': 'multinomial',
 'classifier__n_jobs': None,
 'classifier__penalty': 'l2',
 'classifier__random_state': 123,
 'classifier__solver': 'lbfgs',
 'classifier__tol': 0.0001,
 'classifier__verbose': 0,
 'classifier__warm_start': False,
 'pca': PCA(name='pca', function='transform', n_outputs=1, trainable=True,
 copy=True, iterated_power='auto', n_components=4, random_state=123, svd_solver='auto', tol=0.0, whiten=F

In [115]:
y_pred = best_model.predict(X_test)

In [116]:
accuracy_score(y_test, y_test_pred)

0.6293706293706294