# Full notebook
This notebook puts the whole process together:
1. Loading dataset
2. Defining grammar
3. Searching through the grammar and evaluating candidate pipelines

## 0. Imports

In [None]:
# Uncomment the following lines to install any packages that have not been added yet.

# import Pkg
# using Pkg
# Pkg.add("HTTP")
# Pkg.add("JSON")
# Pkg.add("DataFrames")
# Pkg.add("OpenML")
# Pkg.add("DataFrames") 
# Pkg.add("CSV") 
# Pkg.add("Suppressor")
# Pkg.add("StatsBase")

In [None]:
using ScikitLearn
using ScikitLearn.Pipelines: Pipeline, FeatureUnion
using ScikitLearn.CrossValidation: cross_val_score
using XGBoost
using Revise
using Random
using Statistics: mean
using ExprRules: get_executable
using Suppressor
using Random
using Dates

include("./Herb.jl/src/Herb.jl")
include("./Herb.jl/HerbGrammar.jl/src/HerbGrammar.jl")
include("./Herb.jl/HerbData.jl/src/HerbData.jl")
include("./Herb.jl/HerbEvaluation.jl/src/HerbEvaluation.jl")
include("./Herb.jl/HerbConstraints.jl/src/HerbConstraints.jl")
include("./Herb.jl/HerbSearch.jl/src/HerbSearch.jl")
include("helper.jl")

In [None]:
# import the sk-learn functions
@sk_import decomposition: (PCA)
@sk_import preprocessing: (StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Binarizer, PolynomialFeatures)
@sk_import feature_selection: (VarianceThreshold, SelectKBest, SelectPercentile, SelectFwe, RFE)
@sk_import tree: (DecisionTreeClassifier)
@sk_import ensemble: (RandomForestClassifier, GradientBoostingClassifier)
@sk_import linear_model: (LogisticRegression)
@sk_import neighbors: (NearestNeighbors, KNeighborsClassifier)
@sk_import svm: (LinearSVC)

## 1. Loading datasets

In [None]:
# Dataset ids that can be passed to `get_dataset`:
#   Iris: 61, Seeds: 1499, Blood transfusion: 1464, Monks: 334, Diabetes: 37, ilpd: 1480

# Alternative dataset (Iris):
# dataset = get_dataset(61)

# Earlier notes in this cell report that "it does not work" for the Seeds dataset (1499)
# nor for dataset 1464; the Seeds dataset is nevertheless the one loaded here.
dataset = get_dataset(1499)

# Put the rows in random order.
dataset_shuffled = dataset[shuffle(1:size(dataset, 1)), :]

# 90:10 split: the first `split_index` rows are the training set, the rest the test set.
split_index = floor(Int, 0.90 * size(dataset_shuffled, 1))
train_df = dataset_shuffled[1:split_index, :]
test_df = dataset_shuffled[(split_index + 1):size(dataset_shuffled, 1), :]

# Last expression of the cell: the first five rows of the shuffled data.
first(dataset_shuffled, 5)

In [None]:
# show metadata: print the dimensions of the shuffled dataset, then evaluate
# `describe` on it as the final expression of the cell.
print("size: ", size(dataset_shuffled))
describe(dataset_shuffled)

In [None]:
# Split into features and labels: the last column is used as the label and all
# preceding columns as features.
train_X = train_df[:, 1:end-1]
train_Y = train_df[:, end]
test_X = test_df[:, 1:end-1]
test_Y = test_df[:, end]

# Print the train/test ratio in tenths. The share is rounded rather than truncated,
# so a split such as 673 of 748 rows (~0.8997) is reported as 9:1 instead of 8:2.
ratio = round(Int, 10.0 * (size(train_df, 1) / (size(train_df, 1) + size(test_df, 1))))
print("train:test ratio = ", ratio, ":", (10 - ratio))

## 2. Defining grammar

In [None]:
# Context-free grammar describing the search space of scikit-learn pipelines.
# Every terminal is a `(name, estimator)` tuple, the form used for the steps of
# `Pipeline` and `FeatureUnion`.
grammar = Herb.HerbGrammar.@cfgrammar begin

    # this is the version with multiple classifiers possible
    # START   = CLASSIF | sequence(PRE, CLASSIF)
    # PRE     = PREPROC | FSELECT | sequence(PRE, PRE) | parallel(BRANCH, BRANCH)
    # BRANCH  = PRE | CLASSIF | sequence(PRE, CLASSIF)

    # this is the version with only one classifier
    START   = Pipeline([CLASSIF]) | Pipeline([PRE, CLASSIF])
    PRE     = PREPROC | FSELECT | ("seq", Pipeline([PRE, PRE]))  | ("par", FeatureUnion([PRE, PRE]))

    # preprocessing functions
    # NOTE(review): the StandardScaler step name gets a random suffix each time the
    # expression is evaluated, unlike the other steps; presumably an older workaround
    # for duplicate step names. Confirm whether it is still needed.
    PREPROC =
        ("StandardScaler" * string(rand(Int)), StandardScaler()) |
        ("RobustScaler", RobustScaler()) |
        ("MinMaxScaler", MinMaxScaler()) |
        ("MaxAbsScaler", MaxAbsScaler()) |
        ("PCA", PCA()) |
        ("Binarizer", Binarizer()) |
        ("PolynomialFeatures", PolynomialFeatures())

    # feature selection functions
    FSELECT =
        ("VarianceThreshold", VarianceThreshold()) |
        # ("SelectKBest",  SelectKBest()) |
        ("SelectPercentile",  SelectPercentile()) |
        ("SelectFwe",  SelectFwe()) |
        ("Recursive Feature Elimination",  RFE(LinearSVC()))

    # classifiers
    # KNeighborsClassifier is used for the nearest-neighbour rule: NearestNeighbors is
    # an unsupervised neighbour search without `predict`, so it cannot end a
    # classification pipeline.
    CLASSIF =
        ("DecisionTree", DecisionTreeClassifier()) |
        ("RandomForest", RandomForestClassifier()) |
        ("Gradient Boosting Classifier", GradientBoostingClassifier()) |
        ("LogisticRegression", LogisticRegression()) |
        ("NearestNeighborClassifier", KNeighborsClassifier())
end


In [None]:
"""
    insert_name_indexes(p)

Return a copy of the string `p` in which a running index (1, 2, 3, ...) is inserted
in front of every closing double quote that is directly followed by a comma and a
space. Applied to the textual form of a pipeline expression this makes the step
names unique: `("A", A()), ("B", B())` becomes `("A1", A()), ("B2", B())`.

Text without such an occurrence is returned unchanged. Digits elsewhere in `p` do not
affect the result, and the number of occurrences is not limited.
"""
function insert_name_indexes(p)
    counter = Ref(0)

    # Called once per match, left to right; yields the match prefixed by its index.
    function numbered(_)
        counter[] += 1
        return string(counter[], "\", ")
    end

    return replace(p, "\", " => numbered)
end

In [None]:
"""
    get_random_pipeline(grammar, max_depth, start_symbol)

Pick one program uniformly at random from those produced by
`Herb.HerbSearch.ContextFreeEnumerator(grammar, max_depth, start_symbol)`.

The enumeration is traversed twice: once (on a deep copy) to count the programs and
once to fetch the program at the randomly drawn position. Returns `nothing` when the
enumerator yields no program at all.
"""
function get_random_pipeline(grammar, max_depth, start_symbol)
    # all pipelines that can be assembled in max_depth steps
    cfe = Herb.HerbSearch.ContextFreeEnumerator(grammar, max_depth, start_symbol)

    # First pass, on an independent copy: count the candidates.
    n_programs = count(_ -> true, deepcopy(cfe))
    n_programs == 0 && return nothing

    # Second pass: walk to a uniformly drawn 1-based position.
    target = rand(1:n_programs)
    for (position, pipeline) in enumerate(cfe)
        position == target && return pipeline
    end
    return nothing
end

In [None]:
"""
    vlns(grammar, enumeration_depth; max_seconds=30, max_iterations=100, patience=10, start_depth=4)

Neighbourhood search over pipelines of `grammar`. Starts from a random program
obtained with `get_random_pipeline(grammar, start_depth, :START)` and repeatedly
replaces it with the result of `find_best_neighbour_in_neighbourhood`.

# Arguments
- `grammar`: grammar the programs are drawn from.
- `enumeration_depth`: passed on to `find_best_neighbour_in_neighbourhood`.

# Keywords
- `max_seconds`: wall-clock budget for the whole search.
- `max_iterations`: maximum number of iterations.
- `patience`: stop after this many consecutive iterations with an unchanged cost.
- `start_depth`: depth bound used when drawing the initial program.

# Returns
`(program, cost)` of the last accepted program. The search also stops as soon as the
cost reaches `0.0`. Progress is printed to standard output.
"""
function vlns(grammar, enumeration_depth; max_seconds=30, max_iterations=100, patience=10, start_depth=4)
    current_program = get_random_pipeline(grammar, start_depth, :START)
    if current_program === nothing
        throw(ArgumentError("the grammar yields no program up to depth $start_depth"))
    end

    # Costs are 1 - accuracy and therefore lie in [0, 1]; 1.1 guarantees that the
    # first iteration is registered as a change.
    current_cost = 1.1
    not_improved_counter = 0
    max_time = Dates.Millisecond(round(Int, max_seconds * 1000))
    t_start = now()

    for i in 0:(max_iterations - 1)
        if now() - t_start > max_time
            break
        end
        println("iteration: ", i)
        previous_cost = current_cost
        current_program, current_cost = find_best_neighbour_in_neighbourhood(
            current_program, grammar, enumeration_depth, t_start, max_time
        )

        if current_cost == previous_cost
            not_improved_counter += 1
            if not_improved_counter >= patience
                println("stopping because hasn't improved in ", patience, " iterations")
                break
            end
        else
            not_improved_counter = 0
        end
        if current_cost == 0.0
            println("stopping because cost is 0.0")
            break
        end
    end

    println("final program: ", current_program)
    println("final cost: ", current_cost)
    return current_program, current_cost
end

In [None]:
"""
    find_best_neighbour_in_neighbourhood(current_program, grammar, enumeration_depth, t_start, max_time)

Evaluate neighbours of `current_program` and return `(best_program, best_cost)`,
where the current program itself is the initial best candidate.

Neighbours are obtained by substituting the replacement expressions proposed by
`Herb.HerbSearch.enumerate_neighbours_propose(enumeration_depth)` at the node location
returned by `Herb.HerbSearch.constructNeighbourhoodRuleSubset`. At most 30 neighbours
are tried, and the loop stops early once `now() - t_start` exceeds `max_time`.

Costs are computed with `pipeline_cost_function` on the global `train_X`, `train_Y`,
`test_X` and `test_Y`. The returned program is an independent deep copy.

NOTE(review): when the node location is not the root, replacements are written into
`neighbourhood_node_loc.parent.children`, which presumably belongs to
`current_program`; in that case the argument is left modified after the call.
"""
function find_best_neighbour_in_neighbourhood(current_program, grammar, enumeration_depth, t_start, max_time)
    # Builds the pipeline described by `program` (with unique step names) and scores it.
    function program_cost(program)
        source = insert_name_indexes(string(Herb.HerbSearch.rulenode2expr(program, grammar)))
        pipeline = eval(Meta.parse(source))
        return pipeline_cost_function(pipeline, train_X, train_Y, test_X, test_Y)
    end

    # 1. Construct neighbourhood
    neighbourhood_node_loc, dict = Herb.HerbSearch.constructNeighbourhoodRuleSubset(current_program, grammar)
    propose = Herb.HerbSearch.enumerate_neighbours_propose(enumeration_depth)
    # 5 = max depth of the whole pipeline; the depth of a subprogram is bound by this
    replacement_expressions = propose(current_program, neighbourhood_node_loc, grammar, 5, dict)

    # 2. Find best neighbour
    best_program = deepcopy(current_program)
    best_program_cost = program_cost(best_program)
    possible_program = current_program
    neighbours_tried = 0
    for replacement_expression in replacement_expressions
        if now() - t_start > max_time
            println("timelimit reached")
            break
        end
        if (neighbours_tried % 10) == 0
            println("tried ", neighbours_tried, " neighbours")
            if neighbours_tried == 30
                break
            end
        end

        # Turn the candidate into one of the neighbours: a root location replaces the
        # whole program, any other location replaces a child of its parent in place.
        if neighbourhood_node_loc.i == 0
            possible_program = replacement_expression
        else
            neighbourhood_node_loc.parent.children[neighbourhood_node_loc.i] = replacement_expression
        end

        possible_program_cost = program_cost(possible_program)
        if possible_program_cost < best_program_cost
            # Copy the candidate that was actually evaluated. Copying `current_program`
            # would return the old program whenever the root itself was replaced.
            best_program = deepcopy(possible_program)
            best_program_cost = possible_program_cost
        end
        neighbours_tried += 1
    end
    return best_program, best_program_cost
end

In [None]:
# Run the neighbourhood search.
# NOTE(review): `vlns` calls `find_best_neighbour_in_neighbourhood`, which uses
# `pipeline_cost_function`; that function is defined in a later cell of this notebook,
# so that cell has to be executed before this one.
vlns(grammar, 3) # enumeration_depth = max depth of subprogram that is being replaced, regardless of the NodeLoc

## 3. Search

In [None]:
"""
Fits the pipeline to the training set and measures the accuracy on test set.
input:  pipeline, train_X, train_Y, test_X, test_Y
output: accuracy of pipeline
"""
function evaluate_pipeline(pipeline, train_X, train_Y, test_X, test_Y)
    # Anything written to stderr while fitting/predicting is suppressed; this was added
    # because "ConvergenceWarning: lbfgs failed to converge" showed up often.
    @suppress_err begin
        try
            # Fit on the training data, then predict the test rows.
            fitted = ScikitLearn.fit!(pipeline, Matrix(train_X), Array(train_Y))
            predicted = ScikitLearn.predict(fitted, Matrix(test_X))

            # Accuracy = fraction of predictions equal to the expected label.
            return mean(predicted .== test_Y)
        catch
            # Any failure along the way scores the pipeline with zero accuracy.
            return 0.0
        end
    end
end

In [None]:
"""Trains the pipeline and returns 1-accuracy"""
function pipeline_cost_function(pipeline, train_X, train_Y, test_X, test_Y)
    # Cost is the complement of the accuracy reported by `evaluate_pipeline`.
    accuracy = evaluate_pipeline(pipeline, train_X, train_Y, test_X, test_Y)
    return 1.0 - accuracy
end

In [None]:
"""This function enumerates the grammar and finds the best pipeline. """
function find_best_pipeline_with_bfs_search(grammar, train_X, train_Y, test_X, test_Y, search_depth)
    # Best score seen so far; -1.0 lies below every possible accuracy, so the first
    # successfully evaluated pipeline always becomes the incumbent.
    top_accuracy = -1.0
    top_pipeline = nothing

    # Walk over every program the enumerator produces for :START up to `search_depth`.
    for candidate in Herb.HerbSearch.ContextFreeEnumerator(grammar, search_depth, :START)
        try
            # Build the pipeline object from the program and score it.
            candidate_pipeline = eval(Herb.HerbSearch.rulenode2expr(candidate, grammar))
            candidate_accuracy = evaluate_pipeline(candidate_pipeline, train_X, train_Y, test_X, test_Y)

            # Keep the candidate only when it is strictly better.
            if candidate_accuracy > top_accuracy
                top_accuracy, top_pipeline = candidate_accuracy, candidate_pipeline
            end

            # Report every evaluated candidate.
            print("\n accuracy: ", round(candidate_accuracy, digits=2), " by ", string(candidate_pipeline))
        catch
            # A candidate that cannot be built, scored or printed is skipped.
        end
    end
    return (top_accuracy, top_pipeline)
end

In [None]:
# Search the grammar up to depth 2 and keep the best accuracy with its pipeline.
best_accuracy, best_pipeline = find_best_pipeline_with_bfs_search(
    grammar, train_X, train_Y, test_X, test_Y, 2
)

# Print the best accuracy (two decimals) followed by the pipeline that achieved it.
println("\n\nBest pipeline: ", round(best_accuracy; digits=2))
print(best_pipeline)

In [None]:
# Sanity check: cost (1 - accuracy) of a pipeline that consists of a single
# decision-tree classifier; the cost is the value of the cell.
test_pipeline = Pipeline([("DecisionTree", DecisionTreeClassifier())])
pipeline_cost_function(test_pipeline, train_X, train_Y, test_X, test_Y)

In [None]:
"""This function enumerates the grammar and finds the best pipeline. """
function find_best_pipeline_with_vlsn(grammar, train_X, train_Y, test_X, test_Y, search_depth)
    # Incumbent result; -1.0 lies below every possible accuracy.
    winner_accuracy = -1.0
    winner_pipeline = nothing

    # Programs come from the VLSN enumerator rather than from
    # Herb.HerbSearch.ContextFreeEnumerator(grammar, search_depth, :START).
    # NOTE(review): the five-argument `pipeline_cost_function` is handed over as the
    # cost function here, while another cell of this notebook passes a one-argument
    # wrapper to the same constructor; confirm which arity the enumerator expects.
    programs = Herb.HerbSearch.get_vlsn_enumerator(grammar, [], search_depth, :START, pipeline_cost_function)
    for program in programs
        try
            # Build the pipeline object from the program and score it.
            built = eval(Herb.HerbSearch.rulenode2expr(program, grammar))
            built_accuracy = evaluate_pipeline(built, train_X, train_Y, test_X, test_Y)

            # Keep the candidate only when it is strictly better.
            if built_accuracy > winner_accuracy
                winner_accuracy, winner_pipeline = built_accuracy, built
            end

            # Report every evaluated candidate.
            print("\n accuracy: ", round(built_accuracy, digits=2), " by ", string(built))
        catch
            # A candidate that cannot be built, scored or printed is skipped.
        end
    end
    return (winner_accuracy, winner_pipeline)
end

In [None]:
# Run the VLSN-based search up to depth 2; the returned (accuracy, pipeline) tuple is
# the value of the cell.
find_best_pipeline_with_vlsn(grammar, train_X, train_Y, test_X, test_Y, 2)

In [None]:
"""
    p_cost_f(pipeline)

One-argument cost function: the cost of `pipeline` according to
`pipeline_cost_function`, evaluated on the global `train_X`, `train_Y`, `test_X` and
`test_Y`.
"""
function p_cost_f(pipeline)
    return pipeline_cost_function(pipeline, train_X, train_Y, test_X, test_Y)
end

enumerator = Herb.HerbSearch.get_vlsn_enumerator(grammar, [], 2, :START, p_cost_f)

# Print the expressions of the first 20 programs the enumerator yields.
# `Iterators.take` replaces a hand-maintained global counter, whose update inside a
# top-level loop only works under the soft-scope rules of notebooks and the REPL.
for rule in Iterators.take(enumerator, 20)
    println(Herb.HerbSearch.rulenode2expr(rule, grammar))
end