# Full notebook
This notebook has the whole process put together:
1. Loading dataset
2. Defining grammar
3. Searching through the grammar and evaluating candidate pipelines

## 0. Imports

In [65]:
# uncomment the following if not all packages are added.

# import Pkg
# using Pkg
# Pkg.add("HTTP")
# Pkg.add("JSON")
# Pkg.add("DataFrames")
# Pkg.add("OpenML")
# Pkg.add("DataFrames") 
# Pkg.add("CSV") 
# Pkg.add("Suppressor")

In [66]:
using ScikitLearn
using ScikitLearn.Pipelines: Pipeline, FeatureUnion
using ScikitLearn.CrossValidation: cross_val_score
using XGBoost
using Revise
using Random
using Statistics: mean
using ExprRules: get_executable
using Suppressor

include("../Herb.jl/src/Herb.jl")
include("helper.jl")



get_class_type_dataset (generic function with 1 method)

In [67]:
# import the sk-learn functions
@sk_import decomposition: (PCA)
@sk_import preprocessing: (StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Binarizer, PolynomialFeatures)
@sk_import feature_selection: (VarianceThreshold, SelectKBest, SelectPercentile, SelectFwe, RFE)
@sk_import tree: (DecisionTreeClassifier)
@sk_import ensemble: (RandomForestClassifier, GradientBoostingClassifier)
@sk_import linear_model: (LogisticRegression)
@sk_import neighbors: (NearestNeighbors)
@sk_import svm: (LinearSVC)



PyObject <class 'sklearn.svm._classes.LinearSVC'>

## 1. Loading datasets

In [68]:
# load dataset
dataset = get_dataset(61)

# it does not work for the seeds table dataset!
# dataset = get_dataset(1499)

# does not work either on dataset 1464!

# shuffle the dataset
dataset_shuffled = dataset[shuffle(1:end), :]

# split into train and test sets (90:10)
split_index = floor(Int, size(dataset_shuffled, 1) * 0.9)
train_df = dataset_shuffled[1:split_index, :]
test_df = dataset_shuffled[split_index+1:end, :]

# show first five entries
first(dataset_shuffled, 5)

Row,sepallength,sepalwidth,petallength,petalwidth,class
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Cat…
1,5.4,3.4,1.5,0.4,Iris-setosa
2,6.8,2.8,4.8,1.4,Iris-versicolor
3,6.9,3.1,5.1,2.3,Iris-virginica
4,6.4,2.8,5.6,2.1,Iris-virginica
5,6.5,2.8,4.6,1.5,Iris-versicolor


In [69]:
# show metadata
print("size: ", size(dataset_shuffled))
describe(dataset_shuffled)

size: (150, 5)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,sepallength,5.84333,4.3,5.8,7.9,0,Float64
2,sepalwidth,3.054,2.0,3.0,4.4,0,Float64
3,petallength,3.75867,1.0,4.35,6.9,0,Float64
4,petalwidth,1.19867,0.1,1.3,2.5,0,Float64
5,class,,Iris-setosa,,Iris-virginica,0,"CategoricalValue{String, UInt32}"


In [70]:
# split into features and labels
train_X = train_df[:, 1:end-1]
train_Y = train_df[:, end]
test_X = test_df[:, 1:end-1]
test_Y = test_df[:, end]

# print ratio train/test
ratio = trunc(Int, 10.0 * (size(train_df)[1] / (size(train_df)[1] + size(test_df)[1])))
print("train:test ratio = ", ratio , ":", (10-ratio))

train:test ratio = 9:1

## 2. Defining grammar

In [71]:
grammar = Herb.HerbGrammar.@cfgrammar begin

    # this is the version with multiple classifiers possible
    # START   = CLASSIF | sequence(PRE, CLASSIF)
    # PRE     = PREPROC | FSELECT | sequence(PRE, PRE) | parallel(BRANCH, BRANCH)
    # BRANCH  = PRE | CLASSIF | sequence(PRE, CLASSIF) 

    # this is the version with only one classifier
    START   = Pipeline([CLASSIF]) | Pipeline([PRE, CLASSIF])
    PRE     = PREPROC | FSELECT | ("seq", Pipeline([PRE, PRE]))  | ("par", FeatureUnion([PRE, PRE])) 

    # preprocessing functions
    PREPROC =   
        ("StandardScaler", StandardScaler()) |
        ("RobustScaler", RobustScaler()) |
        ("MinMaxScaler", MinMaxScaler()) |
        ("MaxAbsScaler", MaxAbsScaler()) |
        ("PCA", PCA()) |
        ("Binarizer", Binarizer()) |
        ("PolynomialFeatures", PolynomialFeatures())

    # feature selection functions
    FSELECT =  
        ("VarianceThreshold", VarianceThreshold()) |
        ("SelectKBest",  SelectKBest()) |
        ("SelectPercentile",  SelectPercentile()) |
        ("SelectFwe",  SelectFwe()) |
        ("Recursive Feature Elimination",  RFE(LinearSVC())) 

    # classifiers
    CLASSIF =
        ("DecisionTree", DecisionTreeClassifier()) |
        ("RandomForest", RandomForestClassifier()) |
        ("Gradient Boosting Classifier", GradientBoostingClassifier()) |
        ("LogisticRegression", LogisticRegression()) |
        ("NearestNeighborClassifier", NearestNeighbors())
end
;

In [72]:
# print out all pipelines that can be assembled in two steps
cfe_print = Herb.HerbSearch.ContextFreeEnumerator(grammar, 2, :START)
for rule in cfe_print
    println(Herb.HerbSearch.rulenode2expr(rule, grammar))
end

Pipeline([("DecisionTree", DecisionTreeClassifier())])
Pipeline([("RandomForest", RandomForestClassifier())])
Pipeline([("Gradient Boosting Classifier", GradientBoostingClassifier())])
Pipeline([("LogisticRegression", LogisticRegression())])
Pipeline([("NearestNeighborClassifier", NearestNeighbors())])


## 3. Search

In [73]:
"""
Fits the pipeline to the training set and measures the accuracy on test set.
input:  pipeline, train_X, train_Y, test_X, test_Y
output: accuracy of pipeline
"""
function evaluate_pipeline(pipeline, train_X, train_Y, test_X, test_Y)

    # # this gives the following warning often, so it is suppressed for now.
    # # ConvergenceWarning: lbfgs failed to converge
    @suppress_err begin
        
        # fit the pipeline
        model = ScikitLearn.fit!(pipeline, Matrix(train_X), Array(train_Y))

        # make predictions
        predictions = ScikitLearn.predict(model, Matrix(test_X))

        # measure the accuracy
        accuracy = mean(predictions .== test_Y)
    end
end

evaluate_pipeline

In [76]:
"""This function enumerates the grammar and finds the best pipeline. """
function find_best_pipeline_with_bfs_search(pipeline, train_X, train_Y, test_X, test_Y, search_depth)
    best_accuracy = -1.0
    best_pipeline = nothing

    # enumerate the gramamar
    enumerator = Herb.HerbSearch.ContextFreeEnumerator(grammar, search_depth, :START)
    for rule in enumerator
        try
            # get pipeline and calculate accuracy
            pipeline = eval(Herb.HerbSearch.rulenode2expr(rule, grammar))
            accuracy = evaluate_pipeline(pipeline, train_X, train_Y, test_X, test_Y)

            # update best pipeline
            if (accuracy > best_accuracy) 
                best_accuracy = accuracy
                best_pipeline = pipeline
            end
            
            # print accuracy of pipeline
            print("\n accuracy: ", round(accuracy, digits=2), " by ", string(pipeline))
        catch
            continue
        end
    end
    return (best_accuracy, best_pipeline)
end

find_best_pipeline_with_bfs_search

In [78]:
# find the best pipeline and accuracy with depth 2
(best_accuracy, best_pipeline) = find_best_pipeline_with_bfs_search(pipeline, train_X, train_Y, test_X, test_Y, 2)

# print result
println("\n\nBest pipeline: ", round(best_accuracy, digits=2))
print(best_pipeline)


 accuracy: 0.93 by 

Pipeline(Tuple{Any, Any}[("DecisionTree", PyObject DecisionTreeClassifier())], Any[PyObject DecisionTreeClassifier()])
 accuracy: 0.93 by 

Pipeline(Tuple{Any, Any}[("RandomForest", PyObject RandomForestClassifier())], Any[PyObject RandomForestClassifier()])
 accuracy: 0.93 by Pipeline(Tuple{Any, Any}[("Gradient Boosting Classifier", PyObject GradientBoostingClassifier())], Any[PyObject GradientBoostingClassifier()])
 accuracy: 0.93 by Pipeline(Tuple{Any, Any}[("LogisticRegression", PyObject LogisticRegression())], Any[PyObject LogisticRegression()])

Best pipeline: 0.93
Pipeline(Tuple{Any, Any}[("DecisionTree", PyObject DecisionTreeClassifier())], Any[PyObject DecisionTreeClassifier()])