In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from river import optim
from river import metrics
from river.drift import ADWIN, EDDM
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix

from sail.drift_detection.drift_detector import SAILDriftDetector
from sail.models.auto_ml.auto_pipeline import SAILAutoPipeline
from sail.models.auto_ml.tune import SAILTuneGridSearchCV
from sail.models.river.forest import AdaptiveRandomForestClassifier
from sail.models.river.linear_model import LogisticRegression
from sail.pipeline import SAILPipeline
from sail.transfomers.river.preprocessing import StandardScaler

#### Load Data


In [2]:
# Read agrawal.csv and keep only its first 50,000 rows.
X = pd.read_csv("../../datasets/agrawal.csv").head(50000)

# Split off the "class" column as the target; everything else stays as features.
y = X["class"]
X = X.drop(columns="class")

#### Model Definition


In [3]:
# Candidate estimators; both are offered to the search through `params_grid`.
# Logistic regression trained with plain SGD at a learning rate of 0.1.
logistic_reg = LogisticRegression(optimizer=optim.SGD(lr=0.1))
# Adaptive random forest, initially configured with 10 member models.
random_forest = AdaptiveRandomForestClassifier(n_models=10)

#### Create SAIL Pipeline


In [4]:
# Pipeline stages, listed in the order they are applied.
steps = [
    # Replace NaN values with the column mean.
    ("Imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
    ("standard_scalar", StandardScaler()),
    # Placeholder only: the concrete estimator for this step is supplied by
    # the "classifier" entries of `params_grid`.
    ("classifier", "passthrough"),
]
# The metric is passed as the class `metrics.Accuracy`, not as an instance.
sail_pipeline = SAILPipeline(steps=steps, scoring=metrics.Accuracy)

#### HPO Params


In [5]:
# Search space as a list of independent sub-grids, one per candidate
# classifier. Keys use the `<step>__<param>` form; a bare step name
# ("classifier", "Imputer") replaces the whole pipeline step.
params_grid = [
    {
        # Logistic regression: 2 x 2 combinations of `l2` and `intercept_init`.
        "classifier": [logistic_reg],
        "classifier__l2": [0.1, 0.9],
        "classifier__intercept_init": [0.2, 0.5],
    },
    {
        # Adaptive random forest: two ensemble sizes, with the "Imputer" step
        # swapped for "passthrough".
        "classifier": [random_forest],
        "classifier__n_models": [5, 10],
        "Imputer": ["passthrough"],
    },
]

# Alternative (disabled): a single flat grid that only tunes `l2` and
# `intercept_init` and does not swap the classifier step.
# params_grid = {
#     "classifier__l2": [0.1, 0.9],
#     "classifier__intercept_init": [0.2, 0.5],
# }


#### Create Model Instance


In [6]:
# Wrap the base pipeline in a SAILAutoPipeline that searches `params_grid`
# with SAILTuneGridSearchCV.
auto_pipeline = SAILAutoPipeline(
    pipeline=sail_pipeline,
    pipeline_params_grid=params_grid,
    search_method=SAILTuneGridSearchCV,
    # Options handed to the search method.
    # NOTE(review): "mode": "max" together with "scoring": "accuracy"
    # presumably means "pick the configuration with the highest accuracy" —
    # confirm against the SAILTuneGridSearchCV documentation.
    search_method_params={
        "num_cpus_per_trial": 1,
        "max_iters": 1,
        "early_stopping": False,
        "mode": "max",
        "scoring": "accuracy",
        "pipeline_auto_early_stop": False,
        "keep_best_configurations": 2
    },
    # The training loop in this notebook only starts predicting once more
    # than 1000 rows have been fed in, matching this value.
    search_data_size=1000,
    incremental_training=True,
    # Drift detection backed by river's EDDM detector.
    drift_detector=SAILDriftDetector(model=EDDM(), drift_param="difference"),
    pipeline_strategy="DetectAndIncrement",
)

[2023-07-11 21:25:23:907] - INFO - SAIL : Pipeline Strategy [DetectAndIncrement] created with actions: ['DATA_COLLECTION', 'FIND_BEST_PIPELINE', 'SCORE_AND_DETECT_DRIFT', 'PARTIAL_FIT_MODEL']


#### Start Training


In [7]:
# Accumulators for predictions and their ground-truth labels; they are reused
# by the later evaluation cells.
y_preds = []
y_true = []
batch_size = 50

# Feed rows [0, 1500) to the pipeline in consecutive batches of `batch_size`.
for start in range(0, 1500, batch_size):
    end = start + batch_size
    X_train, y_train = X.iloc[start:end], y.iloc[start:end]

    # search_data_size is 1000: only batches that end past that point are
    # scored, and each one is predicted *before* it is used for training.
    if end > 1000:
        preds = auto_pipeline.predict(X_train)
        if preds is not None:
            y_preds.extend(preds)
            y_true.extend(y_train)

    auto_pipeline.train(X_train, y_train, classifier__classes=[1, 0])


0,1
Current time:,2023-07-11 21:25:36
Running for:,00:00:08.39
Memory:,30.9/64.0 GiB

Trial name,status,loc,Imputer,classifier,classifier__intercep t_init,classifier__l2,classifier__n_models,iter,total time (s),split0_test_score,split1_test_score,split2_test_score
_Trainable_1a3ba181,TERMINATED,127.0.0.1:92102,,LogisticRegress_b100,0.2,0.1,,1,0.174084,0.59,0.655,0.545
_Trainable_44dedec9,TERMINATED,127.0.0.1:92107,,LogisticRegress_25e0,0.2,0.9,,1,0.168272,0.67,0.68,0.63
_Trainable_ad8a85dd,TERMINATED,127.0.0.1:92108,,LogisticRegress_c820,0.5,0.1,,1,0.166972,0.585,0.65,0.55
_Trainable_e43e3e22,TERMINATED,127.0.0.1:92109,,LogisticRegress_f100,0.5,0.9,,1,0.166088,0.665,0.68,0.63
_Trainable_c074ae37,TERMINATED,127.0.0.1:92110,passthrough,AdaptiveRandomF_8280,,,5.0,1,2.00799,0.88,0.825,0.935
_Trainable_dd26f3e5,TERMINATED,127.0.0.1:92111,passthrough,AdaptiveRandomF_9190,,,10.0,1,3.88383,0.95,0.935,0.875


[2m[36m(_Trainable pid=92102)[0m DHAVAL 0
[2m[36m(_Trainable pid=92102)[0m DHAVAL 0
[2m[36m(_Trainable pid=92102)[0m DHAVAL 0
[2m[36m(_Trainable pid=92102)[0m DHAVAL 0
[2m[36m(_Trainable pid=92102)[0m DHAVAL 0
[2m[36m(_Trainable pid=92111)[0m DHAVAL 0[32m [repeated 25x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m


2023-07-11 21:25:36,247	INFO tune.py:945 -- Total run time: 8.40 seconds (8.38 seconds for the tuning loop).


DHAVAL 1


[2023-07-11 21:25:39:256] - INFO - SAIL : Pipeline tuning completed. Disconnecting Ray cluster...
[2023-07-11 21:25:39:257] - INFO - SAIL : Found best params: {'Imputer': 'passthrough', 'classifier': AdaptiveRandomForestClassifier(), 'classifier__n_models': 10}


>>>--------------------------------------------------------------------------------------------




DHAVAL 1




>>>--------------------------------------------------------------------------------------------




DHAVAL 1




>>>--------------------------------------------------------------------------------------------




DHAVAL 1




>>>--------------------------------------------------------------------------------------------




DHAVAL 1




>>>--------------------------------------------------------------------------------------------




DHAVAL 1




>>>--------------------------------------------------------------------------------------------




DHAVAL 1





>>>--------------------------------------------------------------------------------------------




DHAVAL 1




>>>--------------------------------------------------------------------------------------------




DHAVAL 1




>>>--------------------------------------------------------------------------------------------




DHAVAL 1




>>>--------------------------------------------------------------------------------------------




DHAVAL 1




#### Save model


In [None]:
# Save the pipeline, passing "." (the current working directory) as the target location.
auto_pipeline.save_model(".")

#### Load model


In [None]:
# Load a pipeline back from "." (the location used by the save cell) into a
# separate variable, leaving `auto_pipeline` untouched.
new_auto_pipeline = SAILAutoPipeline.load_model(".")

#### Continue training using the loaded model

In [None]:
# Continue the predict-then-train loop on the next 1,500 rows using the
# reloaded pipeline. The first training loop consumed rows [0, 1500), so this
# one starts at row 1500; starting at 1501 would silently skip a row and
# shift every batch by one.
for start in range(1500, 3000, batch_size):

    end = start + batch_size

    X_train = X.iloc[start:end]
    y_train = y.iloc[start:end]

    # Every batch here ends well past the 1000-row search window, so a
    # prediction is requested for each one before training on it.
    preds = new_auto_pipeline.predict(X_train)
    if preds is not None:
        y_preds.extend(list(preds))
        y_true.extend(list(y_train))

    new_auto_pipeline.train(X_train, y_train, classifier__classes=[1, 0])


#### Classification Report


In [None]:
# Per-class precision / recall / F1 over every prediction collected in
# `y_preds` against the matching labels in `y_true`.
# `classification_report` is imported in the notebook's top import cell.
print(classification_report(y_true, y_preds))

#### Plot confusion matrix


In [None]:
# Confusion matrix of the collected predictions; rows are true classes and
# columns are predicted classes. `sns` and `confusion_matrix` come from the
# notebook's top import cell.
cf_matrix = confusion_matrix(y_true, y_preds)

# Divide by the grand total so each cell shows its share of all predictions.
ax = sns.heatmap(cf_matrix / np.sum(cf_matrix), annot=True, fmt=".2%", cmap="Blues")
# Label the figure so it can be read on its own; the trailing `;` hides the repr.
ax.set(
    title="Confusion matrix (share of all predictions)",
    xlabel="Predicted class",
    ylabel="True class",
);