In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from river import metrics
from river import optim
from river.drift import EDDM
from sklearn.impute import SimpleImputer

from sail.drift_detection.drift_detector import SAILDriftDetector
from sail.models.auto_ml.auto_pipeline import SAILAutoPipeline
from sail.models.auto_ml.tune import SAILTuneGridSearchCV
from sail.models.river.forest import AdaptiveRandomForestRegressor
from sail.models.river.linear_model import LinearRegression
from sail.pipeline import SAILPipeline
from sail.transfomers.river.preprocessing import StandardScaler

#### Load Data


In [2]:
# Read the whole dataset once; the path is relative to this notebook's directory.
raw_df = pd.read_csv("../../datasets/HDWF2.csv")

# Target series: the "power" column.
y = raw_df["power"]
# Feature frame: every column except the target and "time".
# A non-mutating drop keeps `raw_df` intact, so this cell is safe to re-run
# and the raw data stays available for inspection.
X = raw_df.drop(columns=["power", "time"])

#### Model Definition


In [3]:
# Candidate regressors that the hyperparameter grid later in the notebook chooses between.

# Linear model driven by an SGD optimizer with learning rate 0.1.
sgd_optimizer = optim.SGD(lr=0.1)
linear_reg = LinearRegression(optimizer=sgd_optimizer)

# Forest ensemble created with its constructor defaults (no seed is passed here).
random_forest = AdaptiveRandomForestRegressor()

#### Create SAIL Pipeline


In [4]:
# Preprocessing stages, built separately so each one is easy to inspect.
mean_imputer = SimpleImputer(strategy="mean", missing_values=np.nan)  # NaN -> column mean
feature_scaler = StandardScaler()

# Ordered (name, stage) pairs. The final "regressor" slot is a "passthrough"
# placeholder; the params grid defined further down lists the estimators
# to be tried in that slot.
steps = [
    ("Imputer", mean_imputer),
    ("standard_scalar", feature_scaler),
    ("regressor", "passthrough"),
]

# The pipeline receives the R2 metric class (not an instance) as its scoring.
sail_pipeline = SAILPipeline(scoring=metrics.R2, steps=steps)

#### Hyperparameter Optimization (HPO) Params


In [5]:
# Each dict is an independent sub-grid: "regressor" names the estimator to plug
# into the pipeline's "regressor" step, and every "regressor__<param>" key lists
# the candidate values to try for that estimator parameter.

# Linear model: 2 x 2 = 4 combinations.
linear_reg_grid = {
    "regressor": [linear_reg],
    "regressor__l2": [0.1, 0.9],
    "regressor__intercept_init": [0.2, 0.5],
}

# Random forest: 3 ensemble sizes.
random_forest_grid = {
    "regressor": [random_forest],
    "regressor__n_models": [10, 15, 20],
}

# Full search space: 4 + 3 = 7 configurations.
params_grid = [linear_reg_grid, random_forest_grid]

#### Create Model Instance


In [6]:
# Options forwarded to the search method (SAILTuneGridSearchCV).
search_method_params = {
    "verbose": 0,
    "num_cpus_per_trial": 1,
    "max_iters": 1,
    "early_stopping": False,
    "mode": "max",
    "scoring": "r2",
    "pipeline_auto_early_stop": False,
    "keep_best_configurations": 2,
}

# Drift detector wrapping river's EDDM.
drift_detector = SAILDriftDetector(model=EDDM(), drift_param="difference")

# Auto pipeline combining the base pipeline, its search space, the search
# method and the drift detector. `search_data_size=1000` is the value the
# training loop below relies on when deciding when to start predicting.
auto_pipeline = SAILAutoPipeline(
    pipeline=sail_pipeline,
    pipeline_params_grid=params_grid,
    search_method=SAILTuneGridSearchCV,
    search_method_params=search_method_params,
    search_data_size=1000,
    incremental_training=True,
    drift_detector=drift_detector,
    pipeline_strategy="DetectAndIncrement",
)

[2023-07-03 00:20:29:468] - INFO - SAIL : Pipeline Strategy [DetectAndIncrement] created with actions: ['DATA_COLLECTION', 'FIND_BEST_PIPELINE', 'SCORE_AND_DETECT_DRIFT', 'PARTIAL_FIT_MODEL']


#### Start Training


In [7]:
# Test-then-train loop over consecutive, non-overlapping batches:
# each batch is first predicted (once a model is available) and then
# passed to `train`, so recorded predictions are always made on unseen rows.
y_preds = []
y_true = []

batch_size = 50
n_samples = 1500  # number of leading rows of X / y streamed through the pipeline
search_data_size = 1000  # keep in sync with `search_data_size` given to SAILAutoPipeline

start = 0
# The first batch ends at `batch_size` (not a hard-coded 50) so that changing
# `batch_size` keeps every batch the same length.
for end in range(batch_size, n_samples + 1, batch_size):

    X_train = X.iloc[start:end]
    y_train = y.iloc[start:end]

    # Predict only after more than `search_data_size` rows have been reached.
    # NOTE(review): presumably no fitted pipeline exists until the search
    # data has been collected and tuned on — confirm.
    if end > search_data_size:
        preds = auto_pipeline.predict(X_train)
        y_preds.extend(preds)
        y_true.extend(y_train)

    auto_pipeline.train(X_train, y_train)
    start = end


SAIL Pipeline Tuning in progress... [Elapsed: 19.89s, Trials=7/7, Class=SAILTuneGridSearchCV, CPU=10, GPU=0, Cluster Memory=40.83 GB]   
[2023-07-03 00:20:52:230] - INFO - SAIL : Pipeline tuning completed. Shutting down Ray cluster...
[2023-07-03 00:20:54:560] - INFO - SAIL : Found best params: {'regressor': AdaptiveRandomForestRegressor(n_models=15), 'regressor__n_models': 15}


>>>--------------------------------------------------------------------------------------------




>>>--------------------------------------------------------------------------------------------




>>>--------------------------------------------------------------------------------------------




>>>--------------------------------------------------------------------------------------------




>>>--------------------------------------------------------------------------------------------




>>>--------------------------------------------------------------------------------------------




>>>--------------------------------------------------------------------------------------------




>>>--------------------------------------------------------------------------------------------




>>>--------------------------------------------------------------------------------------------




>>>--------------------------------------------------------------------------------------------




#### Final Score


In [8]:
# Bare last expression, so the notebook renders the attribute's value as the cell output.
# NOTE(review): presumably the running R2 accumulated over the training calls,
# given `scoring=metrics.R2` on the pipeline — confirm.
auto_pipeline.progressive_score

0.6497283120268231

#### Plot Predictions


In [9]:
# `px` (plotly.express) is imported with the other dependencies in the first cell.

# Pair each recorded prediction with its ground-truth value; keep at most the
# first 500 rows so the chart stays readable.
df = pd.DataFrame({"y_true": y_true, "y_preds": y_preds}).head(500)

# Wide-form line chart: one trace per column, x-axis is the row index.
# A title and axis labels let the figure stand alone when skimming.
fig = px.line(
    df,
    y=["y_true", "y_preds"],
    title="Actual vs. predicted power (first 500 evaluated samples)",
    labels={"index": "sample", "value": "power", "variable": "series"},
)
fig.show()