In [1]:
using Lale

In [2]:
using Random
using Statistics
using Test
using DataFrames: DataFrame, nrow
using AutoMLPipeline: Utils

In [3]:
# Load the iris table and split its row indices with a 0.30 holdout ratio.
# `training` and `testing` are the row subsets reused by the examples below.
# NOTE(review): no RNG seed is set before `holdout`, so the split (and the
# scores recorded below) presumably change from run to run — confirm.
iris = getiris();
trx, tstx = holdout(nrow(iris), 0.30)
training, testing = iris[trx, :], iris[tstx, :];

## AutoML for classifier pipeline

This example uses Lale for combined algorithm selection and hyperparameter tuning
on a classifier pipeline.

In [4]:
# Classification task: columns 1-4 are the features, column 5 is the target.
clf_tr_X  = DataFrame(training[:, 1:4])
clf_tr_y  = Vector(training[:, 5])
clf_tst_X = DataFrame(testing[:, 1:4])
clf_tst_y = Vector(testing[:, 5]);

In [5]:
# Wrap the operators used by the classifier pipeline as Lale operators.
# ConcatFeatures is requested with the extra "lale" argument; the others use
# the one-argument form of `laleoperator`.
PCA, RobustScaler, LogisticRegression, RandomForestClassifier = map(
    laleoperator,
    ("PCA", "RobustScaler", "LogisticRegression", "RandomForestClassifier"),
)
ConcatFeatures = laleoperator("ConcatFeatures", "lale");

In [6]:
# Planned classifier pipeline: PCA and RobustScaler are combined and fed to
# ConcatFeatures, followed by the LogisticRegression / RandomForestClassifier
# alternative. NOTE(review): `&`, `>>` and `|` are assumed to be Lale's
# and/pipe/choice combinators — confirm against the Lale docs.
clf_planned = let features = (PCA & RobustScaler) >> ConcatFeatures,
                  estimator = LogisticRegression | RandomForestClassifier
    features >> estimator
end;
visualize(clf_planned);

In [7]:
# Build the optimizer over the planned pipeline with max_evals=10 and cv=3
# (the recorded progress bar below shows 10 trials), fit it on the training
# split, then display the pipeline returned by `fit`.
clf_hopt = LalePipeOptimizer(clf_planned; max_evals = 10, cv = 3)
clf_trained = fit(clf_hopt, clf_tr_X, clf_tr_y);
visualize(clf_trained)

  0%|                                   | 0/10 [00:00<?, ?trial/s, best loss=?] 10%|▊       | 1/10 [00:00<00:08,  1.11trial/s, best loss: -0.9333333333333332] 20%|█▌      | 2/10 [00:01<00:04,  1.74trial/s, best loss: -0.9333333333333332] 30%|██▍     | 3/10 [00:01<00:03,  2.01trial/s, best loss: -0.9333333333333332] 40%|███▏    | 4/10 [00:02<00:02,  2.19trial/s, best loss: -0.9333333333333332] 50%|████    | 5/10 [00:02<00:02,  1.94trial/s, best loss: -0.9333333333333332] 60%|████▊   | 6/10 [00:03<00:02,  1.82trial/s, best loss: -0.9333333333333332] 70%|█████▌  | 7/10 [00:03<00:01,  1.94trial/s, best loss: -0.9333333333333332] 80%|██████▍ | 8/10 [00:04<00:00,  2.11trial/s, best loss: -0.9619047619047619] 90%|███████▏| 9/10 [00:04<00:00,  2.26trial/s, best loss: -0.9619047619047619]100%|███████| 10/10 [00:04<00:00,  2.46trial/s, best loss: -0.9619047619047619]100%|███████| 10/10 [00:04<00:00,  2.08trial/s, best loss: -0.9619047619047619]

In [8]:
# Predict labels for the held-out rows with the trained pipeline and score
# them against the true labels. The recorded result (97.78) indicates that
# `score(:accuracy, ...)` is reported as a percentage.
clf_pred = predict(clf_trained, clf_tst_X)
clf_accu = score(:accuracy, clf_pred, clf_tst_y)

97.77777777777777

## AutoML for regressor pipeline

This example uses Lale for combined algorithm selection and hyperparameter tuning
on a regressor pipeline.

In [9]:
# Regression task: columns 1-3 are the features, column 4 is the target.
reg_tr_X  = DataFrame(training[:, 1:3])
reg_tr_y  = Vector(training[:, 4])
reg_tst_X = DataFrame(testing[:, 1:3])
reg_tst_y = Vector(testing[:, 4]);

In [10]:
# Wrap the operators used by the regressor pipeline as Lale operators.
# PCA is created again here, so this cell does not rely on the classifier
# section having been run. NoOp is requested with the extra "lale" argument.
PCA, LinearRegression, RandomForestRegressor = map(
    laleoperator, ("PCA", "LinearRegression", "RandomForestRegressor")
)
NoOp = laleoperator("NoOp", "lale");

In [11]:
# Planned regressor pipeline: a PCA / NoOp alternative followed by a
# LinearRegression / RandomForestRegressor alternative.
# NOTE(review): `|` and `>>` are assumed to be Lale's choice and pipe
# combinators — confirm against the Lale docs.
reg_planned = let preprocessor = PCA | NoOp,
                  estimator = LinearRegression | RandomForestRegressor
    preprocessor >> estimator
end;
visualize(reg_planned)

In [12]:
# Build the optimizer over the planned regressor pipeline with max_evals=10
# and cv=3 (the recorded progress bar below shows 10 trials), fit it on the
# training split, then display the pipeline returned by `fit`.
reg_hopt = LalePipeOptimizer(reg_planned; max_evals = 10, cv = 3)
reg_trained = fit(reg_hopt, reg_tr_X, reg_tr_y);
visualize(reg_trained)


  0%|                                   | 0/10 [00:00<?, ?trial/s, best loss=?] 10%|▉        | 1/10 [00:00<00:03,  2.52trial/s, best loss: -0.877020328341168] 20%|█▌      | 2/10 [00:00<00:02,  3.55trial/s, best loss: -0.9370994271945601] 30%|██▍     | 3/10 [00:00<00:01,  3.67trial/s, best loss: -0.9370994271945601] 40%|███▏    | 4/10 [00:01<00:01,  4.20trial/s, best loss: -0.9370994271945601] 50%|████    | 5/10 [00:01<00:01,  3.00trial/s, best loss: -0.9370994271945601] 60%|████▊   | 6/10 [00:02<00:01,  2.61trial/s, best loss: -0.9370994271945601] 70%|█████▌  | 7/10 [00:02<00:01,  2.94trial/s, best loss: -0.9370994271945601] 80%|██████▍ | 8/10 [00:02<00:00,  3.24trial/s, best loss: -0.9370994271945601] 90%|███████▏| 9/10 [00:02<00:00,  3.44trial/s, best loss: -0.9370994271945601]100%|███████| 10/10 [00:02<00:00,  3.96trial/s, best loss: -0.9370994271945601]100%|███████| 10/10 [00:02<00:00,  3.40trial/s, best loss: -0.9370994271945601]

In [13]:
# Predict the target for the held-out rows with the trained pipeline and
# compute the RMSE against the true values. `predict` is used here (rather
# than `transform`) to match the classifier example, which calls `predict`
# on a pipeline returned by `fit` on a LalePipeOptimizer.
reg_pred = predict(reg_trained, reg_tst_X)
reg_rmse = score(:rmse, reg_pred, reg_tst_y)

0.21393231685700517

## Other stuff: type hierarchies and AutoMLPipeline (AMLP) interoperability

In [14]:
# Show the supertype chain of the Lale operator wrapper held in `PCA`.
supertypes(typeof(PCA))

(LaleOp, Lale.LaleAbsTypes.LaleOperator, Learner, Computer, Machine, Any)

In [15]:
# Show the supertype chain of the Lale operator wrapper held in `RandomForestRegressor`.
supertypes(typeof(RandomForestRegressor))

(LaleOp, Lale.LaleAbsTypes.LaleOperator, Learner, Computer, Machine, Any)

In [16]:
# Show the supertype chain of the pipeline optimizer held in `reg_hopt`.
supertypes(typeof(reg_hopt))

(LalePipeOptimizer, Lale.LaleAbsTypes.LaleOperator, Learner, Computer, Machine, Any)

In [17]:
# Regression using an AutoMLPipeline (AMLP) pipeline built from the Lale
# operators defined above.
amlpipe = @pipeline (PCA + NoOp) |> (RandomForestRegressor * LinearRegression)
# Cross-validate on the training split; the recorded output prints one
# score per fold.
crossvalidate(amlpipe, reg_tr_X, reg_tr_y, "mean_squared_error")
# Train the pipeline in place. The result of fit! is not bound to `amlpred`:
# that name is reserved for the predictions assigned on the next line.
fit!(amlpipe, reg_tr_X, reg_tr_y)
amlpred = transform!(amlpipe, reg_tst_X)
amlprmse = score(:rmse, amlpred, reg_tst_y)

fold: 1, 0.013067777777777786
fold: 2, 0.033816161616161605
fold: 3, 0.013980363636363635
fold: 4, 0.031192499999999974
fold: 5, 0.05060499999999994
fold: 6, 0.02585252525252526
fold: 7, 0.048911363636363635
fold: 8, 0.041764444444444423
fold: 9, 0.016301111111111145
fold: 10, 0.05396490909090912
errors: 0


0.18304691431298745

In [18]:
# Show the supertype chain of the AMLP pipeline held in `amlpipe`.
supertypes(typeof(amlpipe))

(Pipeline, Workflow, Machine, Any)

In [19]:
# classification using AMLP pipeline
amlpipe = @pipeline  (PCA + RobustScaler) |> RandomForestClassifier
crossvalidate(amlpipe, clf_tr_X, clf_tr_y, "accuracy_score")
fit!(amlpipe, clf_tr_X, clf_tr_y)
amlpred = transform!(amlpipe,clf_tst_X)
amlpacc = score(:accuracy, amlpred, clf_tst_y)

fold: 1, 0.8
fold: 2, 1.0
fold: 3, 0.9090909090909091
fold: 4, 1.0
fold: 5, 1.0
fold: 6, 0.8181818181818182
fold: 7, 1.0
fold: 8, 1.0
fold: 9, 1.0
fold: 10, 1.0
errors: 0


97.77777777777777

In [20]:
# amlp ops
ohe  = OneHotEncoder()
catf = CatFeatureSelector()
numf = NumFeatureSelector();
#TODO: use Lale OneHotEncoder and Lale Project operators

In [21]:
# Mixed AMLP/Lale regression pipeline. `+` binds tighter than `|>` in Julia,
# so both feature branches are summed before being piped into the regressor;
# the extra parentheses only make that grouping explicit.
plr = @pipeline(
    ((catf |> ohe) + (numf |> RobustScaler |> PCA)) |> RandomForestRegressor
);
# NOTE(review): the trailing `10, false` look like the fold count and a
# verbosity flag (the recorded output reports folds = 10 and prints no
# per-fold lines) — confirm against the crossvalidate signature.
crossvalidate(plr, reg_tr_X, reg_tr_y, "mean_absolute_error", 10, false)

(mean = 0.1343659090909091, std = 0.04278185079085658, folds = 10, errors = 0)

In [22]:
# Mixed AMLP/Lale classification pipeline. `+` binds tighter than `|>` in
# Julia, so both feature branches are summed before being piped into the
# classifier; the extra parentheses only make that grouping explicit.
plc = @pipeline(
    ((catf |> ohe) + (numf |> RobustScaler |> PCA)) |> RandomForestClassifier
);
# NOTE(review): the trailing `10, false` look like the fold count and a
# verbosity flag (the recorded output reports folds = 10 and prints no
# per-fold lines) — confirm against the crossvalidate signature.
crossvalidate(plc, clf_tr_X, clf_tr_y, "accuracy_score", 10, false)

(mean = 0.9054545454545455, std = 0.09793084271099843, folds = 10, errors = 0)