# Full notebook
This notebook has the whole process put together:
1. Loading dataset
2. Defining grammar
3. Searching through the grammar and evaluating candidate pipelines

## 0. Imports

In [3]:
# uncomment the following if not all packages are added.

# import Pkg
# using Pkg
# Pkg.add("HTTP")
# Pkg.add("JSON")
# Pkg.add("DataFrames")
# Pkg.add("OpenML")
# Pkg.add("DataFrames") 
# Pkg.add("CSV") 
# Pkg.add("Suppressor")
# Pkg.add("StatsBase")

In [1]:
using ScikitLearn
using ScikitLearn.Pipelines: Pipeline, FeatureUnion
using ScikitLearn.CrossValidation: cross_val_score
using XGBoost
using Revise
using Random
using Statistics: mean
using ExprRules: get_executable
using Suppressor

include("./Herb.jl/src/Herb.jl")
include("./Herb.jl/HerbGrammar.jl/src/HerbGrammar.jl")
include("./Herb.jl/HerbData.jl/src/HerbData.jl")
include("./Herb.jl/HerbEvaluation.jl/src/HerbEvaluation.jl")
include("./Herb.jl/HerbConstraints.jl/src/HerbConstraints.jl")
include("./Herb.jl/HerbSearch.jl/src/HerbSearch.jl")
include("helper.jl")

get_class_type_dataset (generic function with 1 method)

In [2]:
# import the sk-learn functions
@sk_import decomposition: (PCA)
@sk_import preprocessing: (StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Binarizer, PolynomialFeatures)
@sk_import feature_selection: (VarianceThreshold, SelectKBest, SelectPercentile, SelectFwe, RFE)
@sk_import tree: (DecisionTreeClassifier)
@sk_import ensemble: (RandomForestClassifier, GradientBoostingClassifier)
@sk_import linear_model: (LogisticRegression)
@sk_import neighbors: (NearestNeighbors)
@sk_import svm: (LinearSVC)

PyObject <class 'sklearn.svm._classes.LinearSVC'>

## 1. Loading datasets

In [31]:
# load dataset
dataset = get_dataset(61)

# it does not work for the seeds table dataset!
# dataset = get_dataset(1499)

# does not work either on dataset 1464!

# shuffle the dataset
dataset_shuffled = dataset[shuffle(1:end), :]

# split into train and test sets (90:10)
split_index = floor(Int, size(dataset_shuffled, 1) * 0.90)
train_df = dataset_shuffled[1:split_index, :]
test_df = dataset_shuffled[split_index+1:end, :]

# show first five entries
first(dataset_shuffled, 5)

Row,sepallength,sepalwidth,petallength,petalwidth,class
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Cat…
1,6.2,2.9,4.3,1.3,Iris-versicolor
2,5.4,3.4,1.5,0.4,Iris-setosa
3,5.2,2.7,3.9,1.4,Iris-versicolor
4,5.6,3.0,4.1,1.3,Iris-versicolor
5,6.3,3.3,4.7,1.6,Iris-versicolor


In [32]:
# show metadata
print("size: ", size(dataset_shuffled))
describe(dataset_shuffled)

size: (150, 5)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,sepallength,5.84333,4.3,5.8,7.9,0,Float64
2,sepalwidth,3.054,2.0,3.0,4.4,0,Float64
3,petallength,3.75867,1.0,4.35,6.9,0,Float64
4,petalwidth,1.19867,0.1,1.3,2.5,0,Float64
5,class,,Iris-setosa,,Iris-virginica,0,"CategoricalValue{String, UInt32}"


In [33]:
# split into features and labels
train_X = train_df[:, 1:end-1]
train_Y = train_df[:, end]
test_X = test_df[:, 1:end-1]
test_Y = test_df[:, end]

# print ratio train/test
ratio = trunc(Int, 10.0 * (size(train_df)[1] / (size(train_df)[1] + size(test_df)[1])))
print("train:test ratio = ", ratio , ":", (10-ratio))

train:test ratio = 2:8

## 2. Defining grammar

In [34]:
grammar = Herb.HerbGrammar.@cfgrammar begin

    # this is the version with multiple classifiers possible
    # START   = CLASSIF | sequence(PRE, CLASSIF)
    # PRE     = PREPROC | FSELECT | sequence(PRE, PRE) | parallel(BRANCH, BRANCH)
    # BRANCH  = PRE | CLASSIF | sequence(PRE, CLASSIF) 

    # this is the version with only one classifier
    START   = Pipeline([CLASSIF]) | Pipeline([PRE, CLASSIF])
    PRE     = PREPROC | FSELECT | ("seq", Pipeline([PRE, PRE]))  | ("par", FeatureUnion([PRE, PRE])) 

    # preprocessing functions
    PREPROC =   
        ("StandardScaler", StandardScaler()) |
        ("RobustScaler", RobustScaler()) |
        ("MinMaxScaler", MinMaxScaler()) |
        ("MaxAbsScaler", MaxAbsScaler()) |
        ("PCA", PCA()) |
        ("Binarizer", Binarizer()) |
        ("PolynomialFeatures", PolynomialFeatures())

    # feature selection functions
    FSELECT =  
        ("VarianceThreshold", VarianceThreshold()) |
        # ("SelectKBest",  SelectKBest()) |
        ("SelectPercentile",  SelectPercentile()) |
        ("SelectFwe",  SelectFwe()) |
        ("Recursive Feature Elimination",  RFE(LinearSVC())) 

    # classifiers
    CLASSIF =
        ("DecisionTree", DecisionTreeClassifier()) |
        ("RandomForest", RandomForestClassifier()) |
        ("Gradient Boosting Classifier", GradientBoostingClassifier()) |
        ("LogisticRegression", LogisticRegression()) |
        ("NearestNeighborClassifier", NearestNeighbors())
end


1: START = Pipeline([CLASSIF])
2: START = Pipeline([PRE, CLASSIF])
3: PRE = PREPROC
4: PRE = FSELECT
5: PRE = ("seq", Pipeline([PRE, PRE]))
6: PRE = ("par", FeatureUnion([PRE, PRE]))
7: PREPROC = ("StandardScaler", StandardScaler())
8: PREPROC = ("RobustScaler", RobustScaler())
9: PREPROC = ("MinMaxScaler", MinMaxScaler())
10: PREPROC = ("MaxAbsScaler", MaxAbsScaler())
11: PREPROC = ("PCA", PCA())
12: PREPROC = ("Binarizer", Binarizer())
13: PREPROC = ("PolynomialFeatures", PolynomialFeatures())
14: FSELECT = ("VarianceThreshold", VarianceThreshold())
15: FSELECT = ("SelectPercentile", SelectPercentile())
16: FSELECT = ("SelectFwe", SelectFwe())
17: FSELECT = ("Recursive Feature Elimination", RFE(LinearSVC()))
18: CLASSIF = ("DecisionTree", DecisionTreeClassifier())
19: CLASSIF = ("RandomForest", RandomForestClassifier())
20: CLASSIF = ("Gradient Boosting Classifier", GradientBoostingClassifier())
21: CLASSIF = ("LogisticRegression", LogisticRegression())
22: CLASSIF = ("NearestNeighbo

In [43]:
# print out all pipelines that can be assembled in two steps
cfe_print = Herb.HerbSearch.ContextFreeEnumerator(grammar, 4, :START)

# Start programm vinden
pl = nothing
c = 0
for rule in cfe_print
    c = c + 1
    if c == 51
        pl = rule
        println(Herb.HerbSearch.rulenode2expr(rule, grammar))
        println(pl)
        break
    end
end
pl1 = pl
print(pl1)

Pipeline([("SelectFwe", SelectFwe()), ("DecisionTree", DecisionTreeClassifier())])
2{4{16}18}
2{4{16}18}

In [44]:
function vlns(grammar, enumeration_depth)
    current_program = pl1
    i = 0
    while i < 5
        current_program, current_cost = find_best_neighbour_in_neighbourhood(current_program, grammar, enumeration_depth)
        i += 1
    end
    current_program_cost = pipeline_cost_function(eval(Herb.HerbSearch.rulenode2expr(current_program, grammar)), train_X, train_Y, test_X, test_Y)
    println("final program: ", current_program)
    println("final cost: ", current_program_cost)
    return current_program, current_program_cost
end
    

function find_best_neighbour_in_neighbourhood(current_program, grammar, enumeration_depth)
    println("Start vlns iteration")
    println("current_program = ", current_program)
    println("current_cost = ", pipeline_cost_function(eval(Herb.HerbSearch.rulenode2expr(current_program, grammar)), train_X, train_Y, test_X, test_Y))
    # 1. Construct neighbourhood
    neighbourhood_node_loc, dict = Herb.HerbSearch.constructNeighbourhoodRuleSubset(current_program, grammar)
    replacement_expressions = Herb.HerbSearch.enumerate_neighbours_propose(enumeration_depth)(current_program, 
                                                                                                neighbourhood_node_loc, 
                                                                                                grammar,
                                                                                                3, 
                                                                                                dict)
    for replacement_expression in replacement_expressions
        # println("replacement expression: ", replacement_expression)
    end
    # 2. Find best neighbour
    best_program = deepcopy(current_program)
    best_program_cost = pipeline_cost_function(eval(Herb.HerbSearch.rulenode2expr(best_program, grammar)), train_X, train_Y, test_X, test_Y)
    possible_program = current_program
    for replacement_expression in replacement_expressions
        # change current_program to one of its neighbours 
        if neighbourhood_node_loc.i == 0
            possible_program = replacement_expression
        else
            neighbourhood_node_loc.parent.children[neighbourhood_node_loc.i] = replacement_expression
        end
        possible_program_cost = pipeline_cost_function(eval(Herb.HerbSearch.rulenode2expr(possible_program, grammar)), train_X, train_Y, test_X, test_Y)
        println("possible pipeline: ", possible_program)#Herb.HerbSearch.rulenode2expr(pl, grammar)) 
        println("possible pipeline cost: ", possible_program_cost)
        if possible_program_cost < best_program_cost
            println("Found a better pipeline!: ", possible_program)
            best_program = deepcopy(current_program)
            best_program_cost = possible_program_cost        
        end
    end
    println("Finished vlns iteration")
    println("best_program = ", best_program)
    println("best_cost = ", best_program_cost)
    println()
    return best_program, best_program_cost
end

find_best_neighbour_in_neighbourhood (generic function with 1 method)

In [45]:
vlns(grammar, 2)

Start vlns iteration
current_program = 2{4{16}18}
current_cost = 0.050000000000000044
possible pipeline: 2{4{17}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{13}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{10}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{9}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{8}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{7}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{12}18}
possible pipeline cost: 0.6916666666666667
possible pipeline: 2{4{16}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{11}18}
possible pipeline cost: 0.1166666666666667
possible pipeline: 2{4{15}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{4{14}18}
possible pipeline cost: 0.050000000000000044
Finished vlns iteration
best_program = 2{4{16}18}
best_cost = 0.050000000000000044

Start vlns i



possible pipeline: 2{4{17}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{13}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{10}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{9}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{8}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{7}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{12}18}
possible pipeline cost: 0.6916666666666667
possible pipeline: 2{4{16}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{11}18}
possible pipeline cost: 0.17500000000000004
possible pipeline: 2{4{15}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{4{14}18}
possible pipeline cost: 0.050000000000000044
Finished vlns iteration
best_program = 2{4{16}18}
best_cost = 0.050000000000000044

Start vlns iteration
current_program = 2{4{16}18}
current_cost = 0.050000000000000044
Caught erro




possible pipeline: 1{19}
possible pipeline cost: 0.050000000000000044
possible pipeline: 1{18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 1{21}
possible pipeline cost: 0.050000000000000044


possible pipeline: 1{20}
possible pipeline cost: 0.050000000000000044
Finished vlns iteration
best_program = 2{4{16}18}
best_cost = 0.050000000000000044

Start vlns iteration
current_program = 2{4{16}18}
current_cost = 0.050000000000000044
possible pipeline: 2{4{17}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{13}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{10}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{9}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{8}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{7}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{12}18}
possible pipeline cost: 0.6916666666666667
possible pipeline: 2{4{16}18}
possible pipeline cost: 0.050000000000000044
possible pipeline: 2{3{11}18}
possible pipeline cost: 0.1166666666666667
possible pipeline: 2{4{15}18}
possible pipeline cost: 0.050000000000000044
possible pipeline



(2{4{16}18}, 0.050000000000000044)

## 3. Search

In [25]:
"""
Fits the pipeline to the training set and measures the accuracy on test set.
input:  pipeline, train_X, train_Y, test_X, test_Y
output: accuracy of pipeline
"""
function evaluate_pipeline(pipeline, train_X, train_Y, test_X, test_Y)

    # # this gives the following warning often, so it is suppressed for now.
    # # ConvergenceWarning: lbfgs failed to converge
    # @suppress_err begin
    try
        # fit the pipeline
        # print(pipeline)
        # print(" - ")
        model = ScikitLearn.fit!(pipeline, Matrix(train_X), Array(train_Y))

        # make predictions
        predictions = ScikitLearn.predict(model, Matrix(test_X))

        # measure the accuracy
        accuracy = mean(predictions .== test_Y)
        return accuracy
    catch e
        println("Caught error [in evaluate_pipeline()]: ", e)
        return 0.0
    end
end

evaluate_pipeline

In [10]:
"""Trains the pipeline and returns 1-accuracy"""
function pipeline_cost_function(pipeline, train_X, train_Y, test_X, test_Y)
    return 1.0 - evaluate_pipeline(pipeline, train_X, train_Y, test_X, test_Y)
end

pipeline_cost_function

In [11]:
"""This function enumerates the grammar and finds the best pipeline. """
function find_best_pipeline_with_bfs_search(grammar, train_X, train_Y, test_X, test_Y, search_depth)
    best_accuracy = -1.0
    best_pipeline = nothing

    # enumerate the gramamar
    enumerator = Herb.HerbSearch.ContextFreeEnumerator(grammar, search_depth, :START)
    for rule in enumerator
        try
            # get pipeline and calculate accuracy
            pipeline = eval(Herb.HerbSearch.rulenode2expr(rule, grammar))
            accuracy = evaluate_pipeline(pipeline, train_X, train_Y, test_X, test_Y)

            # update best pipeline
            if (accuracy > best_accuracy) 
                best_accuracy = accuracy
                best_pipeline = pipeline
            end
            
            # print accuracy of pipeline
            print("\n accuracy: ", round(accuracy, digits=2), " by ", string(pipeline))
        catch
            continue
        end
    end
    return (best_accuracy, best_pipeline)
end

find_best_pipeline_with_bfs_search

In [None]:
# find the best pipeline and accuracy with depth 2
(best_accuracy, best_pipeline) = find_best_pipeline_with_bfs_search(grammar, train_X, train_Y, test_X, test_Y, 2)

# print result
println("\n\nBest pipeline: ", round(best_accuracy, digits=2))
print(best_pipeline)

In [None]:
test_pipeline = Pipeline([("DecisionTree", DecisionTreeClassifier())])
pipeline_cost_function(test_pipeline, train_X, train_Y, test_X, test_Y)

In [None]:
"""This function enumerates the grammar and finds the best pipeline. """
function find_best_pipeline_with_vlsn(grammar, train_X, train_Y, test_X, test_Y, search_depth)
    best_accuracy = -1.0
    best_pipeline = nothing

    # enumerate the gramamar
    # enumerator = Herb.HerbSearch.ContextFreeEnumerator(grammar, search_depth, :START)
    enumerator = Herb.HerbSearch.get_vlsn_enumerator(grammar, [], search_depth, :START, pipeline_cost_function)
    for rule in enumerator
        try
            # get pipeline and calculate accuracy
            pipeline = eval(Herb.HerbSearch.rulenode2expr(rule, grammar))
            accuracy = evaluate_pipeline(pipeline, train_X, train_Y, test_X, test_Y)

            # update best pipeline
            if (accuracy > best_accuracy) 
                best_accuracy = accuracy
                best_pipeline = pipeline
            end
            
            # print accuracy of pipeline
            print("\n accuracy: ", round(accuracy, digits=2), " by ", string(pipeline))
        catch
            continue
        end
    end
    return (best_accuracy, best_pipeline)
end

In [None]:
find_best_pipeline_with_vlsn(grammar, train_X, train_Y, test_X, test_Y, 2)

In [None]:
function p_cost_f(pipeline)
    # pipeline = eval(pipeline)
    return pipeline_cost_function(pipeline, train_X, train_Y, test_X, test_Y)
end

enumerator = Herb.HerbSearch.get_vlsn_enumerator(grammar, [], 2, :START, p_cost_f)
# Herb.HerbSearch.get_mh_enumerator()
c = 0
for rule in enumerator
    println(Herb.HerbSearch.rulenode2expr(rule, grammar))
    c = c + 1
    if c == 20
        break
    end
end