In [1]:
import sys

import numpy as np
import pandas as pd
import plotly.express as px
import ray.cloudpickle as cpickle
import river
from ray.tune.search import BasicVariantGenerator
from river import metrics
from river import optim
from river.drift import ADWIN
from sklearn.impute import SimpleImputer

from sail.models.auto_ml.auto_pipeline import SAILAutoPipeline
from sail.models.auto_ml.tune import SAILTuneGridSearchCV
from sail.models.river.forest import AdaptiveRandomForestRegressor
from sail.models.river.linear_model import LinearRegression
from sail.pipeline import SAILPipeline
from sail.transfomers.river.preprocessing import StandardScaler

#### Load Data


In [2]:
# Read the dataset once and leave the loaded frame untouched; the target and
# the feature matrix are derived as new objects instead of mutating in place.
raw_data = pd.read_csv("../../datasets/HDWF2.csv")

# Target: the "power" column.
y = raw_data["power"]
# Features: every column except the target and the "time" column.
X = raw_data.drop(columns=["power", "time"])

#### Model Definition


In [3]:
# Two regressors are instantiated here: a linear model driven by an SGD
# optimizer constructed with 0.1, and an adaptive random forest with its
# default arguments.
sgd_optimizer = optim.SGD(0.1)
linear_reg = LinearRegression(optimizer=sgd_optimizer)
random_forest = AdaptiveRandomForestRegressor()

#### Create SAIL Pipeline


In [4]:
# Preprocessing stages: mean imputation of NaN values, then standard scaling.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
scaler = StandardScaler()

# The final "regressor" slot is left as "passthrough" at construction time.
steps = [
    ("Imputer", imputer),
    ("standard_scalar", scaler),
    ("regressor", "passthrough"),
]
sail_pipeline = SAILPipeline(steps=steps)

[2023-06-26 16:49:58:346] - INFO - SAIL : created SAILPipeline object with ID 611751a6-63de-4a2f-a2a9-dc66431ea043


#### Hyperparameter Optimization (HPO) Params


In [5]:
# One sub-grid per regressor. Keys use the "regressor" step name; the
# "regressor__<param>" entries presumably follow the sklearn-style convention
# for addressing a parameter of that step (confirm against SAIL docs).
linear_reg_grid = {
    "regressor": [linear_reg],
    "regressor__l2": [0.1, 0.9],
    "regressor__intercept_init": [0.2, 0.5],
}
random_forest_grid = {
    "regressor": [random_forest],
    "regressor__n_models": [10, 15, 20],
}
params_grid = [linear_reg_grid, random_forest_grid]

#### Create Model Instance


In [6]:
# Settings handed to the search method through `search_method_params`.
search_params = {
    "max_iters": 1,
    "early_stopping": False,
    "mode": "max",
    "scoring": "r2",
    "pipeline_auto_early_stop": False,
    "keep_best_configurations": 2,
}

# Auto pipeline: grid search over `params_grid` on `sail_pipeline`, with an
# ADWIN drift detector and the "DetectAndIncrement" strategy.
auto_pipeline = SAILAutoPipeline(
    pipeline=sail_pipeline,
    pipeline_params_grid=params_grid,
    search_method=SAILTuneGridSearchCV,
    search_method_params=search_params,
    search_data_size=1000,
    incremental_training=True,
    scoring=metrics.R2,
    drift_detector=ADWIN(delta=0.001),
    pipeline_strategy="DetectAndIncrement",
)

[2023-06-26 16:49:58:377] - INFO - SAIL : Pipeline Strategy [DetectAndIncrement] created with actions: ['DATA_COLLECTION', 'FIND_BEST_PIPELINE', 'SCORE_AND_DETECT_DRIFT', 'PARTIAL_FIT_MODEL']


#### Start Training


In [7]:
# Streaming configuration: rows are fed to the auto pipeline in fixed-size batches.
batch_size = 50
n_samples = 1500  # number of leading rows of X / y to stream
search_data_size = 1000  # keep equal to the search_data_size given to SAILAutoPipeline

# Predictions and matching targets collected after the search window.
y_preds = []
y_true = []

start = 0
# The first batch end is derived from `batch_size` (not a hard-coded 50) so
# every batch keeps the same size when `batch_size` is changed.
for end in range(batch_size, n_samples + 1, batch_size):
    X_train = X.iloc[start:end]
    y_train = y.iloc[start:end]

    # Once more than `search_data_size` rows have been streamed, predictions
    # for a batch are collected before the pipeline is trained on that batch.
    if end > search_data_size:
        preds = auto_pipeline.predict(X_train)
        y_preds.extend(preds)
        y_true.extend(y_train)

    auto_pipeline.train(X_train, y_train)
    start = end


[2m[36m(_Trainable pid=22299)[0m [2023-06-26 16:50:04:259] - INFO - SAIL : created SAILPipeline object with ID 880f9efd-502d-4979-aef4-f087c4c8ea32
[2m[36m(_Trainable pid=22299)[0m [2023-06-26 16:50:04:259] - INFO - SAIL : created SAILPipeline object with ID 1e1d2196-fce1-4a8b-be50-c3c7e0ba6b8c
[2m[36m(_Trainable pid=22299)[0m [2023-06-26 16:50:04:259] - INFO - SAIL : created SAILPipeline object with ID b462f5c8-5387-4aae-858f-7c320d00a884
[2m[36m(_Trainable pid=22299)[0m [2023-06-26 16:50:04:259] - INFO - SAIL : created SAILPipeline object with ID 6ad0806e-f7ff-453d-9424-0baedeb3e4a6
[2m[36m(_Trainable pid=22299)[0m [2023-06-26 16:50:04:260] - INFO - SAIL : created SAILPipeline object with ID 4aad5f87-9a1c-400b-9ab7-c2b7a6b3c708
[2m[36m(_Trainable pid=22299)[0m [2023-06-26 16:50:04:270] - INFO - SAIL : created SAILPipeline object with ID 704d2143-1058-4b3e-b091-26a7bbba5a84
[2m[36m(_Trainable pid=22299)[0m [2023-06-26 16:50:04:300] - INFO - SAIL : created SAILPipe

[2m[36m(_Trainable pid=22299)[0m [Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.0s


2023-06-26 16:50:04,612	INFO tensorboardx.py:269 -- Removed the following hyperparameter values when logging to tensorboard: {'regressor': LinearRegression(initializer=Zeros (),
                 intercept_lr=Constant({'learning_rate': 0.01}),
                 loss=Squared({}),
                 optimizer=SGD({'lr': Constant({'learning_rate': 0.1}), 'n_iterations': 0}))}
2023-06-26 16:50:04,627	INFO tensorboardx.py:269 -- Removed the following hyperparameter values when logging to tensorboard: {'regressor': LinearRegression(initializer=Zeros (),
                 intercept_lr=Constant({'learning_rate': 0.01}),
                 loss=Squared({}),
                 optimizer=SGD({'lr': Constant({'learning_rate': 0.1}), 'n_iterations': 0}))}
[2m[36m(_Trainable pid=22304)[0m [2023-06-26 16:50:09:654] - INFO - SAIL : created SAILPipeline object with ID 834342a9-6c57-401b-a95a-311441e77ed9[32m [repeated 60x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log

[2m[36m(_Trainable pid=22304)[0m [Pipeline] ......... (step 3 of 3) Processing regressor, total=   1.7s[32m [repeated 28x across cluster][0m


2023-06-26 16:50:10,305	INFO tensorboardx.py:269 -- Removed the following hyperparameter values when logging to tensorboard: {'regressor': AdaptiveRandomForestRegressor()}
2023-06-26 16:50:13,125	INFO tensorboardx.py:269 -- Removed the following hyperparameter values when logging to tensorboard: {'regressor': AdaptiveRandomForestRegressor()}
2023-06-26 16:50:16,348	INFO tensorboardx.py:269 -- Removed the following hyperparameter values when logging to tensorboard: {'regressor': AdaptiveRandomForestRegressor()}
[2023-06-26 16:50:16:380] - INFO - SAIL : created SAILPipeline object with ID 03719cee-db6e-4a25-b1f5-a1d1df17d193
[2023-06-26 16:50:16:381] - INFO - SAIL : created SAILPipeline object with ID 9122d076-aff1-45eb-9c96-c0a524e9236e


[2m[36m(_Trainable pid=22308)[0m [Pipeline] ......... (step 3 of 3) Processing regressor, total=   2.3s[32m [repeated 6x across cluster][0m


[2023-06-26 16:50:18:658] - INFO - SAIL : Cumulative Pipeline Score: 0.6661212935827295
[2023-06-26 16:50:18:658] - INFO - SAIL : Partially fitting best pipeline.
[2023-06-26 16:50:18:659] - INFO - SAIL : Calling Partial_fit() on the pipeline.
[2023-06-26 16:50:18:797] - INFO - SAIL : Cumulative Pipeline Score: 0.7813449073461386
[2023-06-26 16:50:18:798] - INFO - SAIL : Partially fitting best pipeline.
[2023-06-26 16:50:18:798] - INFO - SAIL : Calling Partial_fit() on the pipeline.


[Pipeline] ......... (step 3 of 3) Processing regressor, total=   2.2s
[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.1s


[2023-06-26 16:50:18:937] - INFO - SAIL : Cumulative Pipeline Score: 0.6977189933339706
[2023-06-26 16:50:18:938] - INFO - SAIL : Partially fitting best pipeline.
[2023-06-26 16:50:18:938] - INFO - SAIL : Calling Partial_fit() on the pipeline.


[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.1s
[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.2s


[2023-06-26 16:50:19:163] - INFO - SAIL : Cumulative Pipeline Score: 0.7161222105301392
[2023-06-26 16:50:19:163] - INFO - SAIL : Partially fitting best pipeline.
[2023-06-26 16:50:19:164] - INFO - SAIL : Calling Partial_fit() on the pipeline.
[2023-06-26 16:50:19:308] - INFO - SAIL : Cumulative Pipeline Score: 0.7611145580642918
[2023-06-26 16:50:19:309] - INFO - SAIL : Partially fitting best pipeline.
[2023-06-26 16:50:19:309] - INFO - SAIL : Calling Partial_fit() on the pipeline.
[2023-06-26 16:50:19:452] - INFO - SAIL : Cumulative Pipeline Score: 0.6826223109793498
[2023-06-26 16:50:19:453] - INFO - SAIL : Partially fitting best pipeline.
[2023-06-26 16:50:19:453] - INFO - SAIL : Calling Partial_fit() on the pipeline.


[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.1s
[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.1s


[2023-06-26 16:50:19:594] - INFO - SAIL : Cumulative Pipeline Score: 0.6803113522683415
[2023-06-26 16:50:19:594] - INFO - SAIL : Partially fitting best pipeline.
[2023-06-26 16:50:19:595] - INFO - SAIL : Calling Partial_fit() on the pipeline.
[2023-06-26 16:50:19:737] - INFO - SAIL : Cumulative Pipeline Score: 0.655107327496291
[2023-06-26 16:50:19:737] - INFO - SAIL : Partially fitting best pipeline.
[2023-06-26 16:50:19:737] - INFO - SAIL : Calling Partial_fit() on the pipeline.


[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.1s
[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.1s


[2023-06-26 16:50:19:882] - INFO - SAIL : Cumulative Pipeline Score: 0.689400530911422
[2023-06-26 16:50:19:883] - INFO - SAIL : Partially fitting best pipeline.
[2023-06-26 16:50:19:883] - INFO - SAIL : Calling Partial_fit() on the pipeline.
[2023-06-26 16:50:20:26] - INFO - SAIL : Cumulative Pipeline Score: 0.6986844269368895
[2023-06-26 16:50:20:26] - INFO - SAIL : Partially fitting best pipeline.
[2023-06-26 16:50:20:27] - INFO - SAIL : Calling Partial_fit() on the pipeline.


[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.1s
[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.1s
[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.1s


#### Final Score


In [8]:
# Show the auto pipeline's cumulative score as the cell output.
auto_pipeline.cumulative_score

0.6986844269368895

#### Plot Predictions


In [12]:
# Line plot of true targets against predictions (first 500 collected rows).
df = pd.DataFrame({"y_true": y_true, "y_preds": y_preds}).head(500)
fig = px.line(
    df,
    y=["y_true", "y_preds"],
    title="Actual vs. predicted power",
    # Replace plotly's default wide-form labels with readable ones.
    labels={"index": "sample", "value": "power", "variable": "series"},
)
fig.show()