In [1]:
import numpy as np
import pandas as pd
from river import optim
from river import metrics
from river.drift.binary import EDDM
from sail.telemetry import TracingClient
from sail.models.auto_ml.tune import SAILTuneGridSearchCV
from sail.models.river.forest import AdaptiveRandomForestRegressor
from sail.models.river.linear_model import LinearRegression
from sail.models.auto_ml.auto_pipeline import SAILAutoPipeline
from sail.pipeline import SAILPipeline
from sklearn.impute import SimpleImputer
from sail.models.torch.rnn import RNNRegressor
from sail.drift_detection.drift_detector import SAILDriftDetector
from sail.transformers.river.preprocessing import StandardScaler

#### Load Data


In [2]:
X = pd.read_csv("../../datasets/HDWF2.csv")

y = X["power"]
X.drop(["power", "time"], axis=1, inplace=True)

#### Model Definition


In [3]:
linear_reg = LinearRegression(optimizer=optim.SGD(0.1))
random_forest = AdaptiveRandomForestRegressor()
learner_gru = RNNRegressor(
    input_units=12,
    output_units=1,
    hidden_units=100,
    n_hidden_layers=3,
    lr=0.001,
    cell_type="GRU",
    verbose=0,
)

#### Create SAIL Pipeline


In [4]:
steps = [
    ("Imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
    ("standard_scalar", StandardScaler()),
    ("regressor", "passthrough"),
]
sail_pipeline = SAILPipeline(
    steps=steps, scoring=[metrics.R2], verbosity_level=1, verbosity_interval=2
)

#### HPO Params


In [5]:
params_grid = [
    {
        "regressor": [linear_reg],
        "regressor__l2": [0.1, 0.9],
        "regressor__intercept_init": [0.2, 0.5],
    },
    {"regressor": [random_forest], "regressor__n_models": [10, 15, 20]},
    {
        "regressor": [learner_gru],
        "regressor__input_units": [12],
        "regressor__output_units": [1],
        "regressor__hidden_units": [50, 100],
    },
]

#### Create Model Instance


In [6]:
# tracer = TracingClient(service_name="SAIL_Testing", otlp_endpoint="http://localhost:4318")

auto_pipeline = SAILAutoPipeline(
    pipeline=sail_pipeline,
    pipeline_params_grid=params_grid,
    search_method=SAILTuneGridSearchCV,
    search_method_params={
        "num_cpus_per_trial": 1,
        "max_iters": 1,
        "early_stopping": False,
        "mode": "max",
        "scoring": "r2",
        "pipeline_auto_early_stop": False,
        "keep_best_configurations": 2,
        # "runtime_env": {
        #     "py_modules": [sail],
        #     "pip": [
        #         "scikit-learn>=1.2",
        #         "logzero",
        #         "numpy>=1.23",
        #         "river==0.14.*",
        #         "ray>=2.5",
        #         "tune_sklearn",
        #         "dill",
        #         "ipython",
        #     ],
        # },
        # "namespace": "sail-tune",
        # "cluster_address": "ray://localhost:10001"
    },
    search_data_size=500,
    incremental_training=True,
    drift_detector=SAILDriftDetector(model=EDDM(), drift_param="difference"),
    pipeline_strategy="DetectAndIncrement",
    verbosity_level=1,
    verbosity_interval=2,
    tensorboard_log_dir=None,
    tracer=None,
)

[2023-10-08 22:39:53:274] - INFO - SAIL (PipelineStrategy) - Pipeline Strategy [DetectAndIncrement] created with actions: ['DATA_COLLECTION', 'FIND_BEST_PIPELINE', 'SCORE_AND_DETECT_DRIFT', 'PARTIAL_FIT_MODEL']


#### Start Training


In [7]:
y_preds = []
y_true = []
batch_size = 50

In [8]:
batch_size = 50
# with tracer.trace_as_current_span(span_name="PIPELINE_TRAIN"):
for start in range(0, 1000, batch_size):
    end = start + batch_size

    X_train = X.iloc[start:end]
    y_train = y.iloc[start:end]

    if end > 500:  # search_data_size is 500
        preds = auto_pipeline.predict(X_train)
        y_preds.extend(list(preds))
        y_true.extend(list(y_train))

    auto_pipeline.train(X_train, y_train)

0,1
Current time:,2023-10-08 22:40:11
Running for:,00:00:11.98
Memory:,21.7/64.0 GiB

Trial name,status,loc,regressor,regressor__hidden_un its,regressor__input_uni ts,regressor__intercept _init,regressor__l2,regressor__n_models,regressor__output_un its,iter,total time (s),split0_test_score,split1_test_score,split2_test_score
_Trainable_e51f284f,TERMINATED,127.0.0.1:45670,LinearRegressio_8250,,,0.2,0.1,,,1,0.0755241,-4.62577e+19,-3.78536e+19,-3.36403e+19
_Trainable_11cad363,TERMINATED,127.0.0.1:45670,LinearRegressio_70a0,,,0.2,0.9,,,1,0.0749912,-1.49113e+19,-3.12472e+16,-3.01217e+19
_Trainable_ecdf7df2,TERMINATED,127.0.0.1:45670,LinearRegressio_1d80,,,0.5,0.1,,,1,0.0758247,-8.73394e+19,-4.91866e+18,-2.5045e+19
_Trainable_cb999f6e,TERMINATED,127.0.0.1:45670,LinearRegressio_0c10,,,0.5,0.9,,,1,0.0812669,-1.50236e+19,-1.17498e+17,-4.56942e+19
_Trainable_65628954,TERMINATED,127.0.0.1:45670,AdaptiveRandomF_1030,,,,,10.0,,1,3.47091,0.642463,0.0230324,-0.216408
_Trainable_3b2f71c9,TERMINATED,127.0.0.1:45682,AdaptiveRandomF_4fd0,,,,,15.0,,1,4.18859,0.558198,-0.371116,-1.28376
_Trainable_ff8d0585,TERMINATED,127.0.0.1:45670,AdaptiveRandomF_3fa0,,,,,20.0,,1,5.75815,0.505694,-0.315212,-0.262942
_Trainable_53718db3,TERMINATED,127.0.0.1:45701,<class 'sail.mo_33a0,50.0,12.0,,,,1.0,1,0.0882928,-2.39591,-7.31047,-0.62591
_Trainable_acd176f5,TERMINATED,127.0.0.1:45701,<class 'sail.mo_22c0,100.0,12.0,,,,1.0,1,0.0868728,-2.39364,-7.28983,-0.62827


[2023-10-08 22:40:13:882] - INFO - SAIL (PipelineStrategy) - Pipeline tuning completed. Disconnecting Ray cluster...
[2023-10-08 22:40:13:883] - INFO - SAIL (PipelineStrategy) - Found best params: {'regressor': AdaptiveRandomForestRegressor(drift_detector=ADWIN (
      delta=0.001
      clock=32
      max_buckets=5
      min_window_length=5
      grace_period=10
    ),
                                  metric=MSE: 0.,
      delta=0.01
      clock=32
      max_buckets=5
      min_window_length=5
      grace_period=10
    )), 'regressor__n_models': 10}

    
>> Epoch: 12 | Samples Seen: 550 -------------------------------------------------------------------------------------

    
>> Epoch: 14 | Samples Seen: 650 -------------------------------------------------------------------------------------

    
>> Epoch: 16 | Samples Seen: 750 -------------------------------------------------------------------------------------

    
>> Epoch: 18 | Samples Seen: 850 -----------------------------

#### Save Model


In [9]:
auto_pipeline.save_model(".")

[2023-10-08 21:42:23:589] - INFO - SAIL (SAILModel) - Model saved successfully.


'./sail_auto_pipeline'

#### Load Model


In [10]:
new_auto_pipeline = SAILAutoPipeline.load_model(".")

[2023-10-08 21:42:25:404] - INFO - SAIL (PipelineStrategy) - Pipeline Strategy [DetectAndIncrement] created with actions: ['DATA_COLLECTION', 'FIND_BEST_PIPELINE', 'SCORE_AND_DETECT_DRIFT', 'PARTIAL_FIT_MODEL']
[2023-10-08 21:42:25:566] - INFO - SAIL (SAILModel) - Model loaded successfully.


#### Continue training using the load model


In [11]:
for start in range(1501, 2001, batch_size):
    end = start + batch_size

    X_train = X.iloc[start:end]
    y_train = y.iloc[start:end]

    preds = new_auto_pipeline.predict(X_train)
    y_preds.extend(list(preds))
    y_true.extend(list(y_train))

    new_auto_pipeline.train(X_train, y_train)


>>> Epoch: 22 | Samples Seen: 1050 -------------------------------------------------------------------------------------





>>> Epoch: 24 | Samples Seen: 1150 -------------------------------------------------------------------------------------





>>> Epoch: 26 | Samples Seen: 1250 -------------------------------------------------------------------------------------





>>> Epoch: 28 | Samples Seen: 1350 -------------------------------------------------------------------------------------





>>> Epoch: 30 | Samples Seen: 1450 -------------------------------------------------------------------------------------




#### Final Score


In [12]:
new_auto_pipeline.progressive_score

0.7730706991771359

### Plot predictions


In [13]:
import plotly.express as px

df = pd.DataFrame({"y_true": y_true, "y_preds": y_preds}).head(500)
fig = px.line(df, y=["y_true", "y_preds"], title="")
fig.show()