In [1]:
using ScikitLearn
using ScikitLearn.Pipelines: Pipeline, FeatureUnion
using XGBoost
using Revise
using Random
using Statistics: mean
include("helper.jl")
include("lib/Herb.jl/src/Herb.jl")

@sk_import decomposition: (PCA)
@sk_import preprocessing: (StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Binarizer, PolynomialFeatures)
@sk_import feature_selection: (VarianceThreshold, SelectKBest, SelectPercentile, SelectFwe, RFE)
@sk_import tree: (DecisionTreeClassifier)
@sk_import ensemble: (RandomForestClassifier, GradientBoostingClassifier)
@sk_import linear_model: (LogisticRegression)
@sk_import neighbors: (NearestNeighbors)
@sk_import svm: (LinearSVC)

PyObject <class 'sklearn.svm._classes.LinearSVC'>

In [2]:
sequence(a, b) = Pipeline([a, b]) 
parallel(a, b) = FeatureUnion([a, b]) 

g = Herb.HerbGrammar.@cfgrammar begin
    # START   = CLASSIF | sequence(PRE, CLASSIF)
    # PRE     = PREPROC | FSELECT | sequence(PRE, PRE) | parallel(BRANCH, BRANCH)
    # BRANCH  = PRE | CLASSIF | sequence(PRE, CLASSIF) 

    START   = Pipeline([CLASSIF]) | Pipeline([PRE, CLASSIF])
    PRE     = PREPROC | FSELECT | ("seq", Pipeline([PRE, PRE]))  | ("par", FeatureUnion([PRE, PRE])) 
    # BRANCH  = PRE | CLASSIF | Pipeline([PRE, CLASSIF]) 

    PREPROC =   
        ("StandardScaler", StandardScaler()) |
        ("RobustScaler", RobustScaler()) |
        ("MinMaxScaler", MinMaxScaler()) |
        ("MaxAbsScaler", MaxAbsScaler()) |
        ("PCA", PCA()) |
        ("Binarizer", Binarizer()) |
        ("PolynomialFeatures", PolynomialFeatures())
    FSELECT =  
        ("VarianceThreshold", VarianceThreshold()) |
        ("SelectKBest",  SelectKBest()) |
        ("SelectPercentile",  SelectPercentile()) |
        ("SelectFwe",  SelectFwe()) |
        ("Recursive Feature Elimination",  RFE(LinearSVC())) 
    CLASSIF =
        ("DecisionTree", DecisionTreeClassifier()) |
        ("RandomForest", RandomForestClassifier()) |
        ("Gradient Boosting Classifier", GradientBoostingClassifier()) |
        ("LogisticRegression", LogisticRegression()) |
        ("NearestNeighborClassifier", NearestNeighbors())
end

1: START = Pipeline([CLASSIF])
2: START = Pipeline([PRE, CLASSIF])
3: PRE = PREPROC
4: PRE = FSELECT
5: PRE = ("seq", Pipeline([PRE, PRE]))
6: PRE = ("par", FeatureUnion([PRE, PRE]))
7: PREPROC = ("StandardScaler", StandardScaler())
8: PREPROC = ("RobustScaler", RobustScaler())
9: PREPROC = ("MinMaxScaler", MinMaxScaler())
10: PREPROC = ("MaxAbsScaler", MaxAbsScaler())
11: PREPROC = ("PCA", PCA())
12: PREPROC = ("Binarizer", Binarizer())
13: PREPROC = ("PolynomialFeatures", PolynomialFeatures())
14: FSELECT = ("VarianceThreshold", VarianceThreshold())
15: FSELECT = ("SelectKBest", SelectKBest())
16: FSELECT = ("SelectPercentile", SelectPercentile())
17: FSELECT = ("SelectFwe", SelectFwe())
18: FSELECT = ("Recursive Feature Elimination", RFE(LinearSVC()))
19: CLASSIF = ("DecisionTree", DecisionTreeClassifier())
20: CLASSIF = ("RandomForest", RandomForestClassifier())
21: CLASSIF = ("Gradient Boosting Classifier", GradientBoostingClassifier())
22: CLASSIF = ("LogisticRegression", Logisti

In [3]:
function get_accuracy(pipeline, dataset)

    dataset_shuffled = dataset[shuffle(1:end), :]

    # Calculate the split index
    split_index = floor(Int, size(dataset_shuffled, 1) * 0.9)

    # Split the data
    train_df = dataset_shuffled[1:split_index, :]
    test_df = dataset_shuffled[split_index+1:end, :]

    # Get the train and test features and labels
    train_features = train_df[:, 1:end-1]
    train_labels = train_df[:, end]
    test_features = test_df[:, 1:end-1]
    test_labels = test_df[:, end]
    try
        # Fit the pipeline
        ScikitLearn.fit!(pipeline, Matrix(train_features), Array(train_labels))

    
        predictions = ScikitLearn.predict(pipeline, Matrix(test_features))

        accuracy = mean(predictions .== test_labels)

        return accuracy
    catch
        return 0
    end
end

get_accuracy (generic function with 1 method)

In [7]:
using ExprRules
cfe = Herb.HerbSearch.ContextFreeEnumerator(g, 3, :START)

max = -1 

dataset = get_dataset(1464)
max_pipeline = nothing

for rule in cfe
    try
        pipeline = eval(Herb.HerbSearch.rulenode2expr(rule, g))

        accuracy = get_accuracy(pipeline, dataset)
        if accuracy >= max
            max = accuracy
            max_pipeline = pipeline
            println("Accuracy: ", accuracy)
            println(pipeline)
        end
    catch
        continue
    end
    
end

Accuracy: 0.76
Pipeline(Tuple{Any, Any}[("DecisionTree", PyObject DecisionTreeClassifier())], Any[PyObject DecisionTreeClassifier()])


Accuracy: 0.7866666666666666
Pipeline(Tuple{Any, Any}[("Gradient Boosting Classifier", PyObject GradientBoostingClassifier())], Any[PyObject GradientBoostingClassifier()])


Accuracy: 0.8266666666666667
Pipeline(Tuple{Any, Any}[("StandardScaler", PyObject StandardScaler()), ("Gradient Boosting Classifier", PyObject GradientBoostingClassifier())], Any[PyObject StandardScaler(), PyObject GradientBoostingClassifier()])


Accuracy: 0.8266666666666667
Pipeline(Tuple{Any, Any}[("RobustScaler", PyObject RobustScaler()), ("LogisticRegression", PyObject LogisticRegression())], Any[PyObject RobustScaler(), PyObject LogisticRegression()])


Accuracy: 0.8533333333333334
Pipeline(Tuple{Any, Any}[("MaxAbsScaler", PyObject MaxAbsScaler()), ("LogisticRegression", PyObject LogisticRegression())], Any[PyObject MaxAbsScaler(), PyObject LogisticRegression()])




In [9]:
max, max_pipeline

(0.8533333333333334, Pipeline(Tuple{Any, Any}[("MaxAbsScaler", PyObject MaxAbsScaler()), ("LogisticRegression", PyObject LogisticRegression())], Any[PyObject MaxAbsScaler(), PyObject LogisticRegression()]))