In [1]:
using Lale

In [2]:
using Random
using Statistics
using Test
using DataFrames: DataFrame
using AutoMLPipeline: Utils

In [3]:
# Load the iris table once; later cells slice columns 1:4 / 5 out of it for
# classification and columns 1:3 / 4 for regression.
# NOTE(review): getiris is not defined in this notebook — presumably it comes
# from Lale or AutoMLPipeline; confirm which package exports it.
iris = getiris();
#TODO: train-test split

## AutoML for classifier pipeline

This example uses Lale for combined algorithm selection and hyperparameter tuning
on a classifier pipeline.

In [4]:
# Classification inputs: columns 1-4 of iris form the feature table and
# column 5 is the target vector.
clf_X = DataFrame(iris[:, 1:4])
clf_y = Vector(iris[:, 5]);

In [5]:
# Operators for the classifier pipeline, each obtained through laleoperator.
# These four are requested by name only (laleoperator's default second argument).
PCA, RobustScaler, LogisticRegression, RandomForestClassifier = laleoperator.((
    "PCA", "RobustScaler", "LogisticRegression", "RandomForestClassifier"
))
# ConcatFeatures is requested with an explicit "lale" second argument.
ConcatFeatures = laleoperator("ConcatFeatures", "lale");

In [7]:
# Planned classifier pipeline, assembled from two named stages.
# Stage 1: PCA and RobustScaler combined with `&`, fed into ConcatFeatures.
clf_features = (PCA & RobustScaler) >> ConcatFeatures
# Stage 2: a `|` choice between the two classifiers.
clf_choice = LogisticRegression | RandomForestClassifier
clf_planned = clf_features >> clf_choice;
#TODO: visualize planned pipeline

In [10]:
# Wrap the planned pipeline in an optimizer limited to 10 evaluations with
# cv=3, then fit it on the classification data.
clf_hopt = LalePipeOptimizer(clf_planned; max_evals = 10, cv = 3)
clf_trained = fit(clf_hopt, clf_X, clf_y);
#TODO: visualize trained pipeline

  0%|                                   | 0/10 [00:00<?, ?trial/s, best loss=?] 10%|██▏                   | 1/10 [00:00<00:06,  1.29trial/s, best loss: -0.96] 20%|████▍                 | 2/10 [00:01<00:04,  1.96trial/s, best loss: -0.96] 30%|██████▌               | 3/10 [00:01<00:03,  2.24trial/s, best loss: -0.96] 40%|████████▊             | 4/10 [00:01<00:02,  2.55trial/s, best loss: -0.96] 50%|███████████           | 5/10 [00:02<00:02,  2.24trial/s, best loss: -0.96] 60%|█████████████▏        | 6/10 [00:02<00:01,  2.04trial/s, best loss: -0.96] 70%|███████████████▍      | 7/10 [00:03<00:01,  2.20trial/s, best loss: -0.96] 80%|█████████████████▌    | 8/10 [00:03<00:00,  2.37trial/s, best loss: -0.98] 90%|███████████████████▊  | 9/10 [00:04<00:00,  2.46trial/s, best loss: -0.98]100%|█████████████████████| 10/10 [00:04<00:00,  2.64trial/s, best loss: -0.98]100%|█████████████████████| 10/10 [00:04<00:00,  2.31trial/s, best loss: -0.98]

In [11]:
#TODO: change to predict, not transform
# Predictions are produced with transform on the fitted optimizer result, using
# the same clf_X that was passed to fit. The accuracy below is therefore a
# training-set figure (see the train-test split TODO), not a held-out estimate.
clf_pred = transform(clf_trained, clf_X)
# Last expression of the cell, so its value is what the notebook displays.
clf_accu = score(:accuracy, clf_pred, clf_y)

98.0

## AutoML for regressor pipeline

This example uses Lale for combined algorithm selection and hyperparameter tuning
on a regressor pipeline.

In [12]:
# Regression inputs: columns 1-3 of iris form the feature table and
# column 4 is the numeric target.
reg_X = DataFrame(iris[:, 1:3])
reg_y = Vector(iris[:, 4]);

In [13]:
# Operators for the regressor pipeline. PCA is bound again here even though the
# classifier section already created it, which keeps this section self-contained.
PCA, LinearRegression, RandomForestRegressor = laleoperator.((
    "PCA", "LinearRegression", "RandomForestRegressor"
))
# NoOp is requested with an explicit "lale" second argument.
NoOp = laleoperator("NoOp", "lale");

In [14]:
# Planned regressor pipeline: a `|` choice of preprocessing step followed by a
# `|` choice of estimator.
reg_preprocess = PCA | NoOp
reg_estimator = LinearRegression | RandomForestRegressor
reg_planned = reg_preprocess >> reg_estimator;
#TODO: visualize planned pipeline

In [16]:
# Wrap the planned pipeline in an optimizer limited to 10 evaluations with
# cv=3, then fit it on the regression data.
reg_hopt = LalePipeOptimizer(reg_planned; max_evals = 10, cv = 3)
reg_trained = fit(reg_hopt, reg_X, reg_y);
#TODO: visualize trained pipeline


  0%|                                   | 0/10 [00:00<?, ?trial/s, best loss=?] 10%|█         | 1/10 [00:00<00:03,  2.57trial/s, best loss: 49.60474466581064] 20%|█▊       | 2/10 [00:00<00:02,  3.45trial/s, best loss: 0.8193203131550298] 30%|██▋      | 3/10 [00:00<00:02,  3.44trial/s, best loss: 0.8193203131550298] 40%|███▌     | 4/10 [00:01<00:01,  3.95trial/s, best loss: 0.8193203131550298] 50%|████▌    | 5/10 [00:01<00:01,  2.99trial/s, best loss: 0.8193203131550298] 60%|█████▍   | 6/10 [00:02<00:01,  2.54trial/s, best loss: 0.8193203131550298] 70%|██████▎  | 7/10 [00:02<00:01,  2.88trial/s, best loss: 0.8193203131550298] 80%|███████▏ | 8/10 [00:02<00:00,  3.25trial/s, best loss: 0.6099374987699132] 90%|████████ | 9/10 [00:02<00:00,  3.44trial/s, best loss: 0.6099374987699132]100%|████████| 10/10 [00:02<00:00,  3.88trial/s, best loss: 0.6099374987699132]100%|████████| 10/10 [00:02<00:00,  3.34trial/s, best loss: 0.6099374987699132]

In [17]:
#TODO: change to predict, not transform
# Predictions are produced with transform on the fitted optimizer result, using
# the same reg_X that was passed to fit, so the RMSE below is measured on the
# training data rather than on a held-out split.
reg_pred = transform(reg_trained, reg_X)
# Last expression of the cell, so its value is what the notebook displays.
reg_rmse = score(:rmse, reg_pred, reg_y)

0.1905638165002068

## Other stuff: operator type hierarchy and AutoMLPipeline interoperability

In [18]:
# Show the chain of supertypes of the PCA operator's type.
supertypes(typeof(PCA))

(LaleOp, Lale.LaleAbsTypes.LaleOperator, Learner, Computer, Machine, Any)

In [19]:
# Show the chain of supertypes of the RandomForestRegressor operator's type.
supertypes(typeof(RandomForestRegressor))

(LaleOp, Lale.LaleAbsTypes.LaleOperator, Learner, Computer, Machine, Any)

In [16]:
# Show the chain of supertypes of the regression optimizer's type.
supertypes(typeof(reg_hopt))

(LaleOptimizer, Lale.LaleAbsTypes.LaleOperator, Learner, Computer, Machine, Any)

In [20]:
# regression using AMLP pipeline
# The Lale operators created earlier are composed here with the @pipeline macro
# instead of Lale's `>>` / `|` combinators.
# NOTE(review): the meaning of `+` and `*` inside @pipeline is not visible in
# this notebook — presumably feature combination and estimator
# selection/ensembling; confirm against the AutoMLPipeline documentation.
amlpipe = @pipeline  (PCA + NoOp) |> (RandomForestRegressor * LinearRegression)
# Fits on reg_X/reg_y and keeps the resulting predictions for the same rows,
# so the RMSE computed at the end is a training-set figure.
amlpred = fit_transform!(amlpipe, reg_X, reg_y)
# Called for its printed report (one line per fold plus an error count in the
# recorded output); the return value is not stored.
crossvalidate(amlpipe, reg_X, reg_y, "mean_squared_error")
# Last expression of the cell, so its value is what the notebook displays.
amlprmse = score(:rmse, amlpred, reg_y)

fold: 1, 0.03991648148148145
fold: 2, 0.04070537037037037
fold: 3, 0.03292666666666664
fold: 4, 0.0504333333333333
fold: 5, 0.03168870748299319
fold: 6, 0.03315500000000006
fold: 7, 0.045158518518518515
fold: 8, 0.03477799319727889
fold: 9, 0.04218166666666665
fold: 10, 0.04149648148148148
errors: 0


0.08686739361851532

In [21]:
# Show the chain of supertypes of the AMLP pipeline object's type.
supertypes(typeof(amlpipe))

(Pipeline, Workflow, Machine, Any)

In [22]:
# classification using AMLP pipeline
# Rebinds amlpipe (previously the regression pipeline) to a classifier pipeline
# built with the @pipeline macro from the Lale operators created earlier.
# NOTE(review): `+` inside @pipeline presumably combines the outputs of PCA and
# RobustScaler as features — confirm against the AutoMLPipeline documentation.
amlpipe = @pipeline  (PCA + RobustScaler) |> RandomForestClassifier
# Fits on clf_X/clf_y and keeps the resulting predictions for the same rows,
# so the accuracy computed at the end is a training-set figure.
amlpred = fit_transform!(amlpipe, clf_X, clf_y)
# Called for its printed report (one line per fold plus an error count in the
# recorded output); the return value is not stored.
crossvalidate(amlpipe, clf_X, clf_y, "accuracy_score")
# Last expression of the cell, so its value is what the notebook displays.
amlpacc = score(:accuracy, amlpred, clf_y)

fold: 1, 1.0
fold: 2, 1.0
fold: 3, 1.0
fold: 4, 0.9333333333333333
fold: 5, 0.8666666666666667
fold: 6, 0.9333333333333333
fold: 7, 0.9333333333333333
fold: 8, 0.9333333333333333
fold: 9, 0.8666666666666667
fold: 10, 1.0
errors: 0


99.33333333333333

In [23]:
# amlp ops
# Preprocessing elements used by the two @pipeline cells that follow: an
# encoder plus selectors for categorical and numeric features.
# NOTE(review): these constructors are not imported explicitly in this
# notebook — presumably they are AutoMLPipeline elements made available by
# `using Lale`; confirm where they are exported.
ohe  = OneHotEncoder()
catf = CatFeatureSelector()
numf = NumFeatureSelector();
#TODO: use Lale OneHotEncoder and Lale Project operators

In [24]:
# Regression pipeline mixing the AMLP selector/encoder elements with Lale
# operators. By Julia operator precedence `+` binds tighter than `|>`, so the
# two parenthesised branches are combined first and that result is then piped
# into RandomForestRegressor.
plr = @pipeline (catf |> ohe) + (numf |> RobustScaler |> PCA) |> RandomForestRegressor;
# Last expression of the cell: the recorded output is a summary tuple
# (mean, std, folds, errors) with folds = 10 and no per-fold lines.
# NOTE(review): the trailing `10, false` presumably set the fold count and
# turn off per-fold printing — confirm against crossvalidate's signature.
crossvalidate(plr, reg_X, reg_y, "mean_absolute_error", 10, false) 

(mean = 0.1671011111111111, std = 0.033359377664898084, folds = 10, errors = 0)

In [25]:
# Classification pipeline with the same preprocessing branches as the
# regression one. By Julia operator precedence `+` binds tighter than `|>`, so
# the two parenthesised branches are combined first and that result is then
# piped into RandomForestClassifier.
plc = @pipeline (catf |> ohe) + (numf |> RobustScaler |> PCA) |> RandomForestClassifier;
# Last expression of the cell: the recorded output is a summary tuple
# (mean, std, folds, errors) with folds = 10 and no per-fold lines.
# NOTE(review): the trailing `10,false` presumably set the fold count and
# turn off per-fold printing — confirm against crossvalidate's signature.
crossvalidate(plc, clf_X, clf_y,"accuracy_score",10,false) 

(mean = 0.9666666666666666, std = 0.04714045207910316, folds = 10, errors = 0)