In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from river import metrics
from river import optim
from river.drift.binary import EDDM
from sklearn.impute import SimpleImputer

from sail.drift_detection.drift_detector import SAILDriftDetector
from sail.models.auto_ml.auto_pipeline import SAILAutoPipeline
from sail.models.auto_ml.tune import SAILTuneGridSearchCV
from sail.models.river.forest import AdaptiveRandomForestRegressor
from sail.models.river.linear_model import LinearRegression
from sail.pipeline import SAILPipeline
from sail.transformers.river.preprocessing import StandardScaler

#### Load Data


In [2]:
# Read the HDWF2 CSV; the "power" column is used as the regression target.
X = pd.read_csv("../../datasets/HDWF2.csv")

y = X["power"]
# Rebind X to a new frame rather than dropping in place: the frame returned by
# read_csv is never mutated, and the feature matrix excludes both the target
# ("power") and the "time" column.
X = X.drop(columns=["power", "time"])

#### Model Definition


In [3]:
# Linear model optimised with SGD at a learning rate of 0.1.
linear_reg = LinearRegression(
    optimizer=optim.SGD(lr=0.1),
)
# Adaptive random forest created with its default constructor arguments.
random_forest = AdaptiveRandomForestRegressor()

#### Create SAIL Pipeline


In [4]:
# Preprocessing stages: replace NaN values with the column mean, then apply
# the StandardScaler transformer.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
scaler = StandardScaler()

# The last step is a "passthrough" placeholder; the parameter grid supplies
# the actual estimators for the "regressor" slot.
steps = [
    ("Imputer", imputer),
    ("standard_scalar", scaler),
    ("regressor", "passthrough"),
]
sail_pipeline = SAILPipeline(scoring=metrics.R2, steps=steps)

#### HPO Params


In [5]:
# Search space when the "regressor" step is the linear model:
# 2 values of l2 x 2 values of intercept_init.
linear_reg_grid = {
    "regressor": [linear_reg],
    "regressor__l2": [0.1, 0.9],
    "regressor__intercept_init": [0.2, 0.5],
}

# Search space when the "regressor" step is the adaptive random forest:
# 3 ensemble sizes.
random_forest_grid = {
    "regressor": [random_forest],
    "regressor__n_models": [10, 15, 20],
}

# One sub-grid per candidate estimator.
params_grid = [linear_reg_grid, random_forest_grid]

#### Create Model Instance


In [6]:
# Options forwarded to the search method (SAILTuneGridSearchCV): maximise the
# "r2" score, one CPU per trial, a single iteration, no early stopping.
search_params = {
    "num_cpus_per_trial": 1,
    "max_iters": 1,
    "early_stopping": False,
    "mode": "max",
    "scoring": "r2",
    "pipeline_auto_early_stop": False,
    "keep_best_configurations": 2,
}

# Wrap the pipeline and its parameter grid in an auto-pipeline.
# search_data_size=1000 matches the 1000-row boundary used by the training
# loops in this notebook.
auto_pipeline = SAILAutoPipeline(
    pipeline=sail_pipeline,
    pipeline_params_grid=params_grid,
    search_method=SAILTuneGridSearchCV,
    search_method_params=search_params,
    search_data_size=1000,
    pipeline_strategy="PrequentialTraining",
    verbosity_level=1,
    verbosity_interval=4,
    tensorboard_log_dir=None,
    tracer=None,
)

[2023-10-02 01:52:56:339] - INFO - SAIL (PipelineStrategy) - Pipeline Strategy [PrequentialTraining] created with actions: ['DATA_COLLECTION', 'FIND_BEST_PIPELINE', 'PARTIAL_FIT_PIPELINE']


#### Start Training


In [7]:
# Accumulators for predictions and the matching ground-truth values; they are
# shared with the later continue-training and plotting cells.
y_preds, y_true = [], []
batch_size = 50

# Feed the first 1000 rows to the auto-pipeline in batches of `batch_size`.
for start in range(0, 1000, batch_size):
    end = start + batch_size
    rows = slice(start, end)
    X_train, y_train = X.iloc[rows], y.iloc[rows]

    # Predict on a batch before training on it, but only once the batch
    # reaches past the first 1000 rows. With batch_size = 50, `end` never
    # exceeds 1000 in this loop, so no predictions are recorded here.
    if end > 1000:  # search_data_size is 1000
        preds = auto_pipeline.predict(X_train)
        if preds is not None:
            y_preds.extend(preds)
            y_true.extend(y_train)

    auto_pipeline.train(X_train, y_train)


0,1
Current time:,2023-10-02 01:53:19
Running for:,00:00:19.16
Memory:,27.0/64.0 GiB

Trial name,status,loc,regressor,regressor__intercept _init,regressor__l2,regressor__n_models,iter,total time (s),split0_test_score,split1_test_score,split2_test_score
_Trainable_9545223f,TERMINATED,127.0.0.1:69290,LinearRegressio_8e50,0.2,0.1,,1,0.126326,-2.22877e+19,-2.31076e+19,-6.51966e+19
_Trainable_c6e24800,TERMINATED,127.0.0.1:69290,LinearRegressio_f6a0,0.2,0.9,,1,0.125144,-6.82354e+17,-6.05753e+17,-5.67513e+18
_Trainable_6c887265,TERMINATED,127.0.0.1:69290,LinearRegressio_27d0,0.5,0.1,,1,0.131667,-3.12752e+19,-2.58568e+19,-6.76019e+19
_Trainable_09c6d596,TERMINATED,127.0.0.1:69290,LinearRegressio_28f0,0.5,0.9,,1,0.141962,-6.31941e+17,-5.45171e+17,-5.7334e+18
_Trainable_7dd1592c,TERMINATED,127.0.0.1:69290,AdaptiveRandomF_d0c0,,,10.0,1,6.13039,0.640991,0.738074,0.617443
_Trainable_f2e619a0,TERMINATED,127.0.0.1:69303,AdaptiveRandomF_d480,,,15.0,1,8.85006,0.707492,0.702381,0.73332
_Trainable_6f656672,TERMINATED,127.0.0.1:69308,AdaptiveRandomF_3850,,,20.0,1,11.3204,0.657537,0.801584,0.730469


[2023-10-02 01:53:23:684] - INFO - SAIL (PipelineStrategy) - Pipeline tuning completed. Disconnecting Ray cluster...
[2023-10-02 01:53:23:685] - INFO - SAIL (PipelineStrategy) - Found best params: {'regressor': AdaptiveRandomForestRegressor(drift_detector=ADWIN (
      delta=0.001
      clock=32
      max_buckets=5
      min_window_length=5
      grace_period=10
    ),
                                  metric=MSE: 0., n_models=20,
      delta=0.01
      clock=32
      max_buckets=5
      min_window_length=5
      grace_period=10
    )), 'regressor__n_models': 20}


#### Save model


In [13]:
# Save the auto-pipeline under the current directory; the recorded output
# shows the returned location './sail_auto_pipeline'.
auto_pipeline.save_model(".")

[2023-10-02 01:53:54:575] - INFO - SAIL (SAILModel) - Model saved successfully.


'./sail_auto_pipeline'

#### Load model


In [14]:
# Load the previously saved auto-pipeline from the current directory into a
# new object.
# NOTE(review): the recorded log reports strategy [DetectAndIncrement] on load,
# whereas the pipeline was created with pipeline_strategy="PrequentialTraining"
# — confirm the strategy is restored as intended.
new_auto_pipeline = SAILAutoPipeline.load_model(".")

[2023-10-02 01:53:55:789] - INFO - SAIL (PipelineStrategy) - Pipeline Strategy [DetectAndIncrement] created with actions: ['DATA_COLLECTION', 'FIND_BEST_PIPELINE', 'SCORE_AND_DETECT_DRIFT', 'PARTIAL_FIT_MODEL']
[2023-10-02 01:53:56:14] - INFO - SAIL (SAILModel) - Model loaded successfully.


#### Continue training using the loaded model


In [15]:
# Continue with the reloaded pipeline on rows 1000 onward: predict on each
# batch first, record predictions next to the true values, then train on it.
#
# The range is capped at len(X) so that batches starting past the end of the
# data (which would be empty) are never passed to predict/train.
# NOTE(review): the stop value 2001 makes the final batch start at row 2000
# (rows 2000-2049); confirm that is intended rather than stopping at 2000.
stop = min(2001, len(X))

for start in range(1000, stop, batch_size):
    end = start + batch_size

    X_train = X.iloc[start:end]
    y_train = y.iloc[start:end]

    # Every batch in this loop lies beyond the first 1000 rows
    # (search_data_size is 1000), so no `end > 1000` check is needed.
    preds = new_auto_pipeline.predict(X_train)
    if preds is not None:
        y_preds.extend(preds)
        y_true.extend(y_train)

    new_auto_pipeline.train(X_train, y_train)



    
>> Epoch: 24 | Samples Seen: 1150 -------------------------------------------------------------------------------------

    
>> Epoch: 28 | Samples Seen: 1350 -------------------------------------------------------------------------------------

    
>> Epoch: 32 | Samples Seen: 1550 -------------------------------------------------------------------------------------

    
>> Epoch: 36 | Samples Seen: 1750 -------------------------------------------------------------------------------------

    
>> Epoch: 40 | Samples Seen: 1950 -------------------------------------------------------------------------------------


#### Final Score


In [16]:
# Display the reloaded pipeline's progressive_score attribute.
# Presumably the running R2, given scoring=metrics.R2 on the pipeline — verify.
new_auto_pipeline.progressive_score

0.7014607084791555

#### Plot predictions


In [17]:
# Overlay the recorded predictions and the true target values as two lines.
# y_true and y_preds are extended together, so they have equal length.
df = pd.DataFrame({"y_true": y_true, "y_preds": y_preds})

# Wide-form input: Plotly Express names the x axis "index", the y axis "value"
# and the legend "variable"; relabel them so the figure stands on its own.
fig = px.line(
    df,
    y=["y_true", "y_preds"],
    title="Predicted vs. actual power",
    labels={"index": "sample", "value": "power", "variable": "series"},
)
fig.show()