In [1]:
import sys
sys.path.append('..')
import numpy as np, pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from skopt.space import Real, Integer, Categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from nestedcvtraining.api import find_best_model
from nestedcvtraining.switch_case import SwitchCase
from nestedcvtraining.under_sampling_classifier import UnderSamplingClassifier
from skopt import gbrt_minimize
from sklearn.metrics import log_loss

In [6]:
from sklearn.datasets import make_classification

# Small, imbalanced binary toy problem used by every cell below.
# The fixed random_state keeps X and y identical across kernel restarts.
X, y = make_classification(
    # Size and class balance: 40 samples, requested 80/20 split between 2 classes.
    n_samples=40,
    n_classes=2,
    weights=[0.8, 0.2],
    # Feature layout: 20 columns, 5 informative and 5 redundant.
    n_features=20,
    n_informative=5,
    n_redundant=5,
    # Difficulty knobs.
    n_clusters_per_class=3,
    class_sep=0.7,
    flip_y=0.05,
    random_state=42,
)

In [7]:
from collections import Counter

# Tally how many samples carry each label to confirm the class imbalance.
class_counts = Counter(y)
class_counts

Counter({1: 9, 0: 31})

In [4]:
from functools import partial

# One seed shared by every stochastic component built in this cell, so that
# a fresh kernel rebuilds the same candidate pipelines.
RANDOM_STATE = 42

# Each SwitchCase holds several named alternatives for one pipeline step;
# `switch` names the alternative that is currently selected, and the search
# space below lets the optimizer pick among them via `<step>__switch`.

# Step 1: optionally oversample with SMOTE, or leave the data untouched.
resampler = SwitchCase(
    cases=[
        ("resampler_1", SMOTE(k_neighbors=3, random_state=RANDOM_STATE)),
        ("resampler_2", "passthrough"),
    ],
    switch="resampler_1",
)

# Step 2: scale then reduce to 5 columns (PCA or univariate selection),
# or skip preprocessing entirely.
preprocessor = SwitchCase(
    cases=[
        (
            "prep_1",
            SkPipeline([
                ("scale", StandardScaler()),
                ("reduce_dims", PCA(n_components=5)),
            ]),
        ),
        (
            "prep_2",
            SkPipeline([
                ("scale", StandardScaler()),
                # mutual_info_classif is randomized; bind its seed so the
                # selected features do not change between runs.
                (
                    "reduce_dims",
                    SelectKBest(
                        partial(mutual_info_classif, random_state=RANDOM_STATE),
                        k=5,
                    ),
                ),
            ]),
        ),
        ("prep_3", "passthrough"),
    ],
    switch="prep_1",
)

# Step 3: the final classifier.
model = SwitchCase(
    cases=[
        ("model_1", LogisticRegression()),
        ("model_2", RandomForestClassifier(random_state=RANDOM_STATE)),
    ],
    switch="model_1",
)

clf = ImbPipeline(
    [("resampler", resampler), ("preprocessor", preprocessor), ("model", model)]
)

# Dimension names follow `<pipeline step>__<case name>__<parameter>`.
search_space = [
    # Which alternative to use at each step.
    Categorical(["resampler_1", "resampler_2"], name="resampler__switch"),
    Categorical(["prep_1", "prep_2", "prep_3"], name="preprocessor__switch"),
    Categorical(["model_1", "model_2"], name="model__switch"),
    # Hyperparameters of individual alternatives.
    Categorical(["minority", "all"], name="resampler__resampler_1__sampling_strategy"),
    Integer(5, 15, name="model__model_2__max_depth"),
]


In [8]:
# Fold counts are derived from the data instead of being hard-coded, so the
# cell keeps working if the dataset size changes. With one outer fold per
# sample, each outer test fold is expected to hold a single example; the
# inner loop then has one sample fewer to split.
n_samples = len(X)
k_outer = n_samples
k_inner = n_samples - 1

# NOTE(review): `needs_proba` was deprecated in scikit-learn 1.4 in favour of
# `response_method="predict_proba"`; switch once the minimum supported
# scikit-learn version allows it.
neg_log_loss = make_scorer(
    log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1]
)

best_model, best_params, report = find_best_model(
    X=X,
    y=y,
    model=clf,
    search_space=search_space,
    verbose=False,
    k_inner=k_inner,
    k_outer=k_outer,
    # Skip every even-numbered fold to halve the runtime of both loops.
    skip_outer_folds=list(range(0, k_outer, 2)),
    skip_inner_folds=list(range(0, k_inner, 2)),
    n_initial_points=5,
    n_calls=5,
    calibrate="only_best",
    calibrate_params={"method": "isotonic"},
    optimizing_metric=neg_log_loss,
    other_metrics={"acc": "accuracy"},
    skopt_func=gbrt_minimize,
)

Looping over 1 outer fold
Looping over 3 outer fold
Looping over 5 outer fold
Looping over 7 outer fold
Looping over 9 outer fold
Looping over 11 outer fold
Looping over 13 outer fold
Looping over 15 outer fold
Looping over 17 outer fold
Looping over 19 outer fold
Looping over 21 outer fold
Looping over 23 outer fold
Looping over 25 outer fold
Looping over 27 outer fold
Looping over 29 outer fold
Looping over 31 outer fold
Looping over 33 outer fold
Looping over 35 outer fold
Looping over 37 outer fold
Looping over 39 outer fold


In [9]:
# Summary statistics of every metric over the outer folds that were evaluated.
outer_metrics = report.get_outer_metrics_report()
outer_metrics

{'acc': {'mean': 0.7, 'sd': 0.45825756949558394, 'min': 0.0, 'max': 1.0},
 'optimizing_metric': {'mean': -0.7358888637090166,
  'sd': 0.7540255045514256,
  'min': -2.4849066497880004,
  'max': -0.04144985384594088}}

In [14]:
# Print the predicted probability of the true class for each example that was
# left out during the outer loop.
# The loop variable is deliberately not called `model`: that name is bound to
# the SwitchCase classifier step in the pipeline cell and must not be
# overwritten by a fitted per-fold model.
for fold_model, test_idxs in report.iter_models_test_idxs():
    y_test_proba = fold_model.predict_proba(X[test_idxs])
    y_true = y[test_idxs]
    # Report every held-out sample of the fold, not only the first one.
    # Labels are 0/1, so the label doubles as the probability column index.
    for proba_row, true_label in zip(y_test_proba, y_true):
        print(proba_row[true_label])

0.605189393939394
0.12095238095238096
0.8428571428571429
0.71
0.6200000000000001
0.9099999999999999
0.5814868111339213
0.37
0.8166666666666667
0.13999999999999999
0.95
0.6785185185185185
0.9199999999999999
0.5726018588421768
0.3371428571428571
0.14666666666666667
0.08333333333333333
0.9319607843137254
0.5333333333333334
0.9593974442272666
