In [1]:
import os

import numpy as np
import pandas as pd
import plotly.express as px
from river import metrics
from river import optim
from river.drift.binary import EDDM
from sklearn.impute import SimpleImputer

from sail.drift_detection.drift_detector import SAILDriftDetector
from sail.models.auto_ml.auto_pipeline import SAILAutoPipeline
from sail.models.auto_ml.tune import SAILTuneGridSearchCV
from sail.models.river.forest import AdaptiveRandomForestRegressor
from sail.models.sklearn.linear_model import SGDRegressor
from sail.models.torch.rnn import RNNRegressor
from sail.pipeline import SAILPipeline
from sail.telemetry import TracingClient
from sail.transformers.river.preprocessing import StandardScaler

#### Load Data


In [2]:
# Read the dataset, split off the "power" column as the regression target and
# discard the "time" column so that X holds only the remaining feature columns.
X = pd.read_csv("../../datasets/HDWF2.csv")
y = X.pop("power")
X = X.drop(columns=["time"])

#### Model Definition


In [3]:
# Three candidate regressors, one per backend wrapped by SAIL.

# scikit-learn backend: SGD-based linear regressor.
sgd_regressor = SGDRegressor(alpha=1e-4)

# River backend: adaptive random forest, constructed with no arguments.
random_forest = AdaptiveRandomForestRegressor()

# Torch backend: recurrent regressor built from GRU cells.
# NOTE(review): input_units=12 presumably matches the number of feature
# columns left in X after dropping "power" and "time" — confirm.
learner_gru = RNNRegressor(
    cell_type="GRU",
    input_units=12,
    hidden_units=100,
    n_hidden_layers=3,
    output_units=1,
    lr=1e-3,
    verbose=0,
)

#### Create SAIL Pipeline


In [5]:
# Preprocessing chain (mean imputation, then standard scaling) followed by an
# estimator slot.
steps = [
    ("Imputer", SimpleImputer(strategy="mean", missing_values=np.nan)),
    ("standard_scalar", StandardScaler()),
    # Placeholder only: the concrete estimator for this step is supplied
    # through the "regressor" entries of the hyper-parameter grid.
    ("regressor", "passthrough"),
]

sail_pipeline = SAILPipeline(
    steps=steps,
    scoring=["R2"],
    verbosity_level=1,
    verbosity_interval=2,
)

#### HPO Params


In [6]:
# One sub-grid per candidate estimator. Every sub-grid first selects the
# object placed in the pipeline's "regressor" step, then lists the values to
# try for that estimator's parameters.

# SGD regressor: 2 x 2 = 4 combinations.
sgd_grid = {
    "regressor": [sgd_regressor],
    "regressor__l1_ratio": [0.15, 0.20],
    "regressor__tol": [0.003, 0.002],
}

# Adaptive random forest: 3 ensemble sizes.
forest_grid = {
    "regressor": [random_forest],
    "regressor__n_models": [10, 15, 20],
}

# GRU regressor: input/output sizes are pinned, only the hidden width varies.
gru_grid = {
    "regressor": [learner_gru],
    "regressor__input_units": [12],
    "regressor__output_units": [1],
    "regressor__hidden_units": [50, 100],
}

params_grid = [sgd_grid, forest_grid, gru_grid]

#### Create Model Instance


In [7]:
# Machine-specific settings. They are read from the environment so the
# notebook can run on another machine without editing code; the fallbacks are
# the values this notebook was originally executed with.
otlp_endpoint = os.environ.get("SAIL_OTLP_ENDPOINT", "http://83.212.75.52:31318")
tensorboard_log_dir = os.environ.get(
    "SAIL_TENSORBOARD_LOG_DIR", "/Users/dhaval/Projects/Tensorboard_logs"
)

# Telemetry client; it is handed to the auto-pipeline below and also used to
# open the "Pipeline-Train" span around the training loop.
tracer = TracingClient(
    service_name="SAILAutoPipeline-Regression",
    otlp_endpoint=otlp_endpoint,
)

auto_pipeline = SAILAutoPipeline(
    pipeline=sail_pipeline,
    pipeline_params_grid=params_grid,
    search_method=SAILTuneGridSearchCV,
    # Options forwarded to the search method: R2 scoring, maximised.
    search_method_params={
        "num_cpus_per_trial": 1,
        "max_iters": 1,
        "early_stopping": False,
        "mode": "max",
        "scoring": "r2",
        "pipeline_auto_early_stop": False,
        "keep_best_configurations": 2,
    },
    # The training loop relies on this value: it only starts recording
    # predictions once more than 500 rows have been fed to the pipeline.
    search_data_size=500,
    incremental_training=True,
    drift_detector=SAILDriftDetector(model=EDDM(), drift_param="difference"),
    pipeline_strategy="DetectAndIncrement",
    verbosity_level=1,
    verbosity_interval=2,
    tensorboard_log_dir=tensorboard_log_dir,
    tracer=tracer,
)

[2023-12-15 17:41:34:117] - INFO - SAIL (PipelineStrategy) - Pipeline Strategy [DetectAndIncrement] created with actions: ['DATA_COLLECTION', 'FIND_BEST_PIPELINE', 'SCORE_AND_DETECT_DRIFT', 'PARTIAL_FIT_MODEL']


#### Start Training


In [8]:
# Accumulators for predictions and the matching ground-truth values; the
# training loops append to them batch by batch.
y_preds, y_true = [], []

# Number of rows handed to the pipeline per iteration.
batch_size = 50

In [9]:
batch_size = 50

# Feed rows 0-999 to the auto-pipeline in consecutive batches, all inside a
# single "Pipeline-Train" tracing span.
with tracer.trace_as_current_span(span_name="Pipeline-Train"):
    for start in range(0, 1000, batch_size):
        end = start + batch_size
        X_train, y_train = X.iloc[start:end], y.iloc[start:end]

        # Batches ending at or before row 500 (search_data_size) are only
        # trained on. Every later batch is predicted first and the result
        # recorded, then used for training.
        if end > 500:
            preds = auto_pipeline.predict(X_train)
            y_preds.extend(preds)
            y_true.extend(y_train)

        auto_pipeline.train(X_train, y_train)

0,1
Current time:,2023-12-15 17:42:15
Running for:,00:00:11.34
Memory:,32.1/64.0 GiB

Trial name,status,loc,regressor,regressor__hidden_un its,regressor__input_uni ts,regressor__l1_ratio,regressor__n_models,regressor__output_un its,regressor__tol,iter,total time (s),split0_test_score,split1_test_score,split2_test_score
_Trainable_baaf28fa,TERMINATED,127.0.0.1:11639,SGDRegressor(),,,0.15,,,0.003,1,0.0528138,0.732875,0.436958,-0.0560196
_Trainable_eb411859,TERMINATED,127.0.0.1:11639,SGDRegressor(),,,0.15,,,0.002,1,0.050256,0.737458,0.449437,-0.0843729
_Trainable_b242e870,TERMINATED,127.0.0.1:11639,SGDRegressor(),,,0.2,,,0.003,1,0.0537822,0.734884,0.447927,-0.0390967
_Trainable_7ed87743,TERMINATED,127.0.0.1:11639,SGDRegressor(),,,0.2,,,0.002,1,0.057987,0.730399,0.436981,-0.0352027
_Trainable_03b0e2b6,TERMINATED,127.0.0.1:11639,AdaptiveRandomF_5750,,,,10.0,,,1,3.00886,0.63241,-0.114111,-1.17354
_Trainable_eb2f844e,TERMINATED,127.0.0.1:11652,AdaptiveRandomF_4d00,,,,15.0,,,1,4.25646,0.510226,0.0187798,-1.37097
_Trainable_0592bc02,TERMINATED,127.0.0.1:11639,AdaptiveRandomF_1bd0,,,,20.0,,,1,5.68677,0.686286,-0.242424,-0.836628
_Trainable_e318e429,TERMINATED,127.0.0.1:11659,<class 'sail.mo_3520,50.0,12.0,,,1.0,,1,0.100128,-2.39795,-7.28219,-0.630165
_Trainable_45aed57d,TERMINATED,127.0.0.1:11659,<class 'sail.mo_2260,100.0,12.0,,,1.0,,1,0.0945449,-2.40088,-7.30701,-0.625912


[2023-12-15 17:42:17:75] - INFO - SAIL (PipelineStrategy) - Pipeline tuning completed. Disconnecting Ray cluster...
[2023-12-15 17:42:17:76] - INFO - SAIL (PipelineStrategy) - Found best params: {'regressor': SGDRegressor(l1_ratio=0.2, tol=0.003), 'regressor__l1_ratio': 0.2, 'regressor__tol': 0.003}
[2023-12-15 17:42:17:82] - INFO - SAIL (TensorboardWriter) - Sending training output to Tensorboard logs. Please run `tensorboard --logdir /Users/dhaval/Projects/Tensorboard_logs/Training_Logs` in terminal to start tensorboard server and track training progress.

    
>> Epoch: 12 | Samples Seen: 550 -------------------------------------------------------------------------------------

    
>> Epoch: 14 | Samples Seen: 650 -------------------------------------------------------------------------------------

    
>> Epoch: 16 | Samples Seen: 750 -------------------------------------------------------------------------------------

    
>> Epoch: 18 | Samples Seen: 850 ----------------------

#### Save Model


In [10]:
# Persist the trained auto-pipeline, passing the current directory as the
# target. The call is the cell's last expression, so its return value is
# displayed as the cell output.
auto_pipeline.save_model(".")

'./sail_auto_pipeline'

#### Load Model


In [11]:
# Restore the auto-pipeline from the same directory that was passed to
# save_model(), binding it to a new name so the original object stays intact.
new_auto_pipeline = SAILAutoPipeline.load_model(".")

[2023-12-15 17:42:33:368] - INFO - SAIL (PipelineStrategy) - Pipeline Strategy [DetectAndIncrement] created with actions: ['DATA_COLLECTION', 'FIND_BEST_PIPELINE', 'SCORE_AND_DETECT_DRIFT', 'PARTIAL_FIT_MODEL']
[2023-12-15 17:42:33:370] - INFO - SAIL (TensorboardWriter) - Sending training output to Tensorboard logs. Please run `tensorboard --logdir /Users/dhaval/Projects/Tensorboard_logs/Training_Logs` in terminal to start tensorboard server and track training progress.


#### Continue training using the loaded model


In [12]:
# Resume predict-then-train on the restored pipeline, extending the same
# y_preds / y_true accumulators used before the save.
# NOTE(review): the first loop stopped at row 1000 but this one starts at row
# 1501, so rows 1000-1500 are never used — confirm whether range(1000, ...)
# was intended.
for start in range(1501, 2001, batch_size):
    end = start + batch_size
    X_train, y_train = X.iloc[start:end], y.iloc[start:end]

    # Record the prediction before the model has seen the batch's targets.
    preds = new_auto_pipeline.predict(X_train)
    y_preds.extend(preds)
    y_true.extend(y_train)

    new_auto_pipeline.train(X_train, y_train)


    
>> Epoch: 22 | Samples Seen: 1050 -------------------------------------------------------------------------------------



    
>> Epoch: 24 | Samples Seen: 1150 -------------------------------------------------------------------------------------

    
>> Epoch: 26 | Samples Seen: 1250 -------------------------------------------------------------------------------------

    
>> Epoch: 28 | Samples Seen: 1350 -------------------------------------------------------------------------------------

    
>> Epoch: 30 | Samples Seen: 1450 -------------------------------------------------------------------------------------


#### Final Score


In [13]:
# Bare expression so the notebook displays the restored pipeline's
# progressive score as the cell output.
new_auto_pipeline.progressive_score

0.7472256780765789

### Plot predictions


In [14]:
# Compare recorded predictions with the ground truth. `px` is imported in the
# notebook's import cell.
# Only the first 500 recorded pairs are plotted (`.head(500)`); the
# accumulators may hold more.
df = pd.DataFrame({"y_true": y_true, "y_preds": y_preds}).head(500)

# Wide-form line plot: one trace per column, row position on the x-axis.
# The title and axis labels let the figure stand on its own.
fig = px.line(
    df,
    y=["y_true", "y_preds"],
    title="Actual vs. predicted power (first 500 recorded predictions)",
    labels={"index": "Sample", "value": "power", "variable": "Series"},
)
fig.show()