In [1]:
using Lale

In [2]:
using Random
using Statistics
using Test
using DataFrames: DataFrame
using AutoMLPipeline: Utils

In [3]:
# Load the iris data set. `getiris` is not defined in this notebook; presumably it is
# exported by one of the packages loaded above -- TODO confirm which one.
# Later cells take columns 1-4 / column 5 as classifier features / target and
# columns 1-3 / column 4 as regressor features / target.
iris = getiris();
#TODO: train-test split
# NOTE: until the split above exists, every model below is fitted and scored on the same rows.

## AutoML for classifier pipeline

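This example uses Lale to perform combined algorithm selection and hyperparameter
tuning on a classifier pipeline: a single search both picks one of the candidate
classifiers and tunes the hyperparameters of the pipeline steps.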

In [4]:
# Classification inputs: columns 1-4 of iris as a DataFrame of features,
# column 5 as a plain Vector holding the target.
clf_X = DataFrame(iris[:, 1:4])
clf_y = Vector(iris[:, 5]);

In [5]:
# Feature transformers, created with the one-argument form of `laleoperator`.
PCA, RobustScaler = laleoperator("PCA"), laleoperator("RobustScaler")
# The concatenation step is requested explicitly from the "lale" source.
ConcatFeatures = laleoperator("ConcatFeatures", "lale")
# Candidate classifiers for the last pipeline step.
LogisticRegression, RandomForestClassifier = (
    laleoperator("LogisticRegression"),
    laleoperator("RandomForestClassifier"),
);

In [6]:
# Planned (not yet trained) classifier pipeline, one stage per line:
# both transformers combined with `&`, then feature concatenation, then one
# of the two classifiers combined with `|`.
clf_planned = (
    (PCA & RobustScaler) >>
    ConcatFeatures >>
    (LogisticRegression | RandomForestClassifier)
);
# TODO: visualize planned pipeline

In [7]:
# Wrap the planned pipeline in a "Hyperopt" optimizer (max_evals = 10, cv = 3),
# then fit it on the full classification data; the trailing `;` hides the result.
clf_hopt = LaleOptimizer(clf_planned, "Hyperopt"; max_evals = 10, cv = 3)
clf_trained = fit(clf_hopt, clf_X, clf_y);
# TODO: visualize trained pipeline

100%|███████| 10/10 [00:08<00:00,  1.20trial/s, best loss: -0.9666666666666667]

In [8]:
#TODO: change to predict, not transform
# Apply the tuned classifier pipeline to the same rows it was fitted on and compare
# its output with the true labels. Because no hold-out set exists yet, the value
# shown below is a training-set accuracy, not an estimate of generalization.
clf_pred = transform(clf_trained, clf_X)
# Last expression of the cell, so its value is what the notebook displays.
clf_accu = score(:accuracy, clf_pred, clf_y)

97.33333333333334

## AutoML for regressor pipeline

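This example uses Lale to perform combined algorithm selection and hyperparameter
tuning on a regressor pipeline: a single search picks the preprocessing step and
the regressor from the listed candidates and tunes their hyperparameters.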

In [9]:
# Regression inputs: columns 1-3 of iris as a DataFrame of features,
# column 4 as a plain Vector holding the target.
reg_X = DataFrame(iris[:, 1:3])
reg_y = Vector(iris[:, 4]);

In [10]:
# Preprocessing candidates: PCA (bound again here, as in the classifier section)
# and the NoOp operator requested from the "lale" source.
PCA, NoOp = laleoperator("PCA"), laleoperator("NoOp", "lale")
# Candidate regressors for the last pipeline step.
LinearRegression, RandomForestRegressor = (
    laleoperator("LinearRegression"),
    laleoperator("RandomForestRegressor"),
);

In [11]:
# Planned regressor pipeline: one of the two preprocessing candidates,
# followed by one of the two regressor candidates.
reg_planned = (
    (PCA | NoOp) >>
    (LinearRegression | RandomForestRegressor)
);
# TODO: visualize planned pipeline

In [12]:
# Same optimizer settings as the classifier section ("Hyperopt", max_evals = 10,
# cv = 3), now applied to the regressor plan and fitted on the regression data.
reg_hopt = LaleOptimizer(reg_planned, "Hyperopt"; max_evals = 10, cv = 3)
reg_trained = fit(reg_hopt, reg_X, reg_y);
# TODO: visualize trained pipeline


100%|████████| 10/10 [00:04<00:00,  2.11trial/s, best loss: 0.6099374987699129]

In [13]:
#TODO: change to predict, not transform
# Apply the tuned regressor pipeline to the rows it was fitted on and score the
# output against the true targets with the `:rmse` metric. With no hold-out set,
# the value displayed below is a training-set error.
reg_pred = transform(reg_trained, reg_X)
# Last expression of the cell, so its value is what the notebook displays.
reg_rmse = score(:rmse, reg_pred, reg_y)

0.1905638165002068

## Other stuff: operator type hierarchy and AutoMLPipeline interoperability

In [14]:
# Show the chain of supertypes of the PCA operator's type.
supertypes(typeof(PCA))

(LaleOp, Lale.LaleAbsTypes.LaleOperator, Learner, Computer, Machine, Any)

In [15]:
# Show the chain of supertypes of the RandomForestRegressor operator's type.
supertypes(typeof(RandomForestRegressor))

(LaleOp, Lale.LaleAbsTypes.LaleOperator, Learner, Computer, Machine, Any)

In [16]:
# Show the chain of supertypes of the optimizer object's type.
supertypes(typeof(reg_hopt))

(LaleOptimizer, Lale.LaleAbsTypes.LaleOperator, Learner, Computer, Machine, Any)

In [17]:
# Regression with an AutoMLPipeline (AMLP) pipeline assembled from the Lale operators.
amlpipe = @pipeline((PCA + NoOp) |> (RandomForestRegressor * LinearRegression))
# Fit on the full regression data and keep the pipeline output for scoring below.
amlpred = fit_transform!(amlpipe, reg_X, reg_y)
# Return value is discarded; the call is kept for the per-fold lines it prints.
crossvalidate(amlpipe, reg_X, reg_y, "mean_squared_error")
# Last expression of the cell: RMSE of the fitted pipeline on the rows it was fitted on.
amlprmse = score(:rmse, amlpred, reg_y)

fold: 1, 0.03514803112962976
fold: 2, 0.029896995145880588
fold: 3, 0.034010771555555555
fold: 4, 0.049248799444444505
fold: 5, 0.040149968907407484
fold: 6, 0.01578295531481481
fold: 7, 0.03188891511451248
fold: 8, 0.04110084196296324
fold: 9, 0.06609304236583483
fold: 10, 0.02241286729936931
errors: 0


0.07133788276361826

In [18]:
# Show the chain of supertypes of the AMLP pipeline's type.
supertypes(typeof(amlpipe))

(Pipeline, Workflow, Machine, Any)

In [19]:
# Classification with an AutoMLPipeline (AMLP) pipeline assembled from the Lale operators.
amlpipe = @pipeline((PCA + RobustScaler) |> RandomForestClassifier)
# Fit on the full classification data and keep the pipeline output for scoring below.
amlpred = fit_transform!(amlpipe, clf_X, clf_y)
# Return value is discarded; the call is kept for the per-fold lines it prints.
crossvalidate(amlpipe, clf_X, clf_y, "accuracy_score")
# Last expression of the cell: accuracy of the fitted pipeline on the rows it was fitted on.
amlpacc = score(:accuracy, amlpred, clf_y)

fold: 1, 1.0
fold: 2, 1.0
fold: 3, 1.0
fold: 4, 1.0
fold: 5, 1.0
fold: 6, 0.9333333333333333
fold: 7, 0.8666666666666667
fold: 8, 1.0
fold: 9, 0.8
fold: 10, 1.0
errors: 0


100.0

In [23]:
# AMLP helper operators: a one-hot encoder plus categorical and numeric
# feature selectors, each built with its default constructor.
ohe, catf, numf = OneHotEncoder(), CatFeatureSelector(), NumFeatureSelector();
# TODO: use Lale OneHotEncoder and Lale Project operators

In [21]:
# Regressor pipeline with two feature branches joined by `+`:
# selected categorical columns one-hot encoded, and selected numeric columns
# scaled then passed through PCA; the joined features feed the random forest.
plr = @pipeline ((catf |> ohe) + (numf |> RobustScaler |> PCA)) |> RandomForestRegressor;
# Positional arguments 10 and false are forwarded unchanged to `crossvalidate`;
# the summary it returns is the value the notebook displays for this cell.
crossvalidate(plr, reg_X, reg_y, "mean_absolute_error", 10, false)

(mean = 0.1510826190476191, std = 0.038446140923021906, folds = 10, errors = 0)

In [22]:
# Classifier pipeline with the same two feature branches as the regressor one:
# one-hot encoded categorical columns joined by `+` with scaled, PCA-transformed
# numeric columns; the joined features feed the random forest classifier.
plc = @pipeline ((catf |> ohe) + (numf |> RobustScaler |> PCA)) |> RandomForestClassifier;
# Positional arguments 10 and false are forwarded unchanged to `crossvalidate`;
# the summary it returns is the value the notebook displays for this cell.
crossvalidate(plc, clf_X, clf_y, "accuracy_score", 10, false)

(mean = 0.9600000000000002, std = 0.06440611887195305, folds = 10, errors = 0)