# Using AI planning to explore data science pipelines

In [1]:
from __future__ import print_function
import sys
import os
import types

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../grammar2lale")))

# Clean output directory where we store planning and result files
os.system('rm -rf ../output')
os.system('mkdir -p ../output')


0

## 1. Start with a Data Science grammar, in EBNF format

In [2]:
# This is the grammar file we will use
GRAMMAR_FILE="../grammar/dsgrammar-subset-sklearn.bnf"

# Copy grammar to the output directory
os.system("cp " + GRAMMAR_FILE + " ../output/")

0

## 2. Convert the grammar into an HTN domain and problem

In [3]:
from grammar2lale import Grammar2Lale

# Generate HTN specifications
G2L = Grammar2Lale(grammar_file=GRAMMAR_FILE)
with open("../output/domain.htn", "w") as f:
    f.write(G2L.htn_domain);
with open("../output/problem.htn", "w") as f:
    f.write(G2L.htn_problem);


Generating HTN specification from grammar
Printing HTN domain


## 3. Use https://github.com/ronwalf/HTN-Translation to translate to a PDDL task

In [5]:
import re
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# as a safety step, setting costs to 0 for any parts of the grammar that are non-identifiers (e.g., parens, etc.)
for token in G2L.htn.mapping:
    if not re.match('^[_a-zA-Z]', str(token)):
        G2L.costs[token] = 0
        
# prepare the list of possible constraints
constraint_options = G2L.get_selectable_constraints()
constraint_options.sort()    

# prepare a constraint selection form
interact_pipeline_params=interact.options(manual=True, manual_name='Generate PDDL')


pipelines = []
NUM_PIPELINES = 10
CONSTRAINTS = []


# This is the function that handles the constraint selection
@interact_pipeline_params(num_pipelines=widgets.IntSlider(value=10, min=1, max=100), 
                          constraints=widgets.SelectMultiple(options=constraint_options,
                                           description='Search constraints',
                                           rows=min(20, len(constraint_options))))
def select_pipeline_gen_params(num_pipelines, constraints):
    global pipelines
    global NUM_PIPELINES
    global CONSTRAINTS
    NUM_PIPELINES = num_pipelines
    CONSTRAINTS = list(constraints)
    G2L.create_pddl_task(NUM_PIPELINES, CONSTRAINTS)
    with open("../output/domain.pddl", "w") as f:
        f.write(G2L.last_task['domain'])
    with open("../output/problem.pddl", "w") as f:
        f.write(G2L.last_task['problem'])


interactive(children=(IntSlider(value=10, description='num_pipelines', min=1), SelectMultiple(description='Sea…

## 4. Use a planner to solve the planning task (in this case, kstar: https://github.com/ctpelok77/kstar)

In [6]:
import json

G2L.run_pddl_planner()
with open("../output/first_planner_call.json", "w") as f:
    f.write(json.dumps(G2L.last_planner_object, indent=3))

Running the planner...
Created domain file in /tmp/cc221326-bdf8-4b8b-a460-d491a6bfaf1d/domain.pddl
Created problem file in /tmp/cc221326-bdf8-4b8b-a460-d491a6bfaf1d/problem.pddl
Running kstar /tmp/cc221326-bdf8-4b8b-a460-d491a6bfaf1d/domain.pddl /tmp/cc221326-bdf8-4b8b-a460-d491a6bfaf1d/problem.pddl --search "kstar(blind(),k=50,json_file_to_dump=result.json)"
Plans returned after 0.6571543216705322 seconds.


## 5. Translate plans to LALE (https://github.com/IBM/lale) Data Science pipelines

In [7]:
# Translate to pipelines
pipelines = G2L.translate_to_pipelines(NUM_PIPELINES)

from pipeline_optimizer import PipelineOptimizer
from sklearn.datasets import load_iris

from lale.helpers import to_graphviz
from lale.lib.sklearn import *
from lale.lib.lale import ConcatFeatures as Concat
from lale.lib.lale import NoOp
from lale.lib.sklearn import KNeighborsClassifier as KNN
from lale.lib.sklearn import OneHotEncoder as OneHotEnc
from lale.lib.sklearn import Nystroem
from lale.lib.sklearn import PCA

optimizer = PipelineOptimizer(load_iris(return_X_y=True))
# instantiate LALE objects from pipeline definitions
LALE_pipelines = [optimizer.to_lale_pipeline(p) for p in pipelines]

# Display selected pipeline
def show_pipeline(pipeline):
    print("Displaying pipeline " + pipeline['id'] + ", with cost " + str(pipeline['score']))
    print(pipeline['pipeline'])
    print('==================================================================================')
    print()
    print()
    print()
    display(to_graphviz(pipeline['lale_pipeline']))

display_pipelines = [[p['pipeline'], p] for p in LALE_pipelines]    
    
interact(show_pipeline, pipeline=display_pipelines)

Translating plans to LALE pipelines.


interactive(children=(Dropdown(description='pipeline', options=(['( NoOp() & ( Normalizer() ) ) >> Concat() >>…

<function __main__.show_pipeline(pipeline)>

## 6. Run one of the pipelines on sample data

## 7. Train hyperparameters and evaluate the resulting LALE pipelines

In [8]:
trained_pipelines, dropped_pipelines = optimizer.evaluate_and_train_pipelines(pipelines)

Plan 1/10
Starting to optimize ( NoOp() & ( Normalizer() ) ) >> Concat() >> LogisticRegression()
100%|██████████| 20/20 [00:03<00:00,  5.53trial/s, best loss: -0.97]
Fit completed.
Predict completed.
Best accuracy: 0.98
Completed optimization for ( NoOp() & ( Normalizer() ) ) >> Concat() >> LogisticRegression()
Plan 2/10
Starting to optimize ( NoOp() & ( Normalizer() ) ) >> Concat() >> QuadraticDiscriminantAnalysis()
100%|██████████| 20/20 [00:02<00:00,  7.04trial/s, best loss: -0.97]              
Fit completed.
Predict completed.
Best accuracy: 0.98
Completed optimization for ( NoOp() & ( Normalizer() ) ) >> Concat() >> QuadraticDiscriminantAnalysis()
Plan 3/10
Starting to optimize ( NoOp() & ( RobustScaler() ) ) >> Concat() >> QuadraticDiscriminantAnalysis()
100%|██████████| 20/20 [00:02<00:00,  6.67trial/s, best loss: -0.97]             
Fit completed.
Predict completed.
Best accuracy: 0.98
Completed optimization for ( NoOp() & ( RobustScaler() ) ) >> Concat() >> QuadraticDiscrimin

In [9]:
from IPython.display import HTML
from tabulate import tabulate
from lale.pretty_print import to_string

def show_pipeline_accuracy(tp):
    pipeline_table = [[to_string(p['trained_pipeline']).replace('\n', '<br/>'), str(p['best_accuracy'])] for p in tp]
    display(HTML(tabulate(pipeline_table, headers=['Pipeline', 'Accuracy'], tablefmt='html')))


show_pipeline_accuracy(trained_pipelines)

Pipeline,Accuracy
"from lale.lib.lale import Hyperopt from lale.lib.lale import NoOp from sklearn.preprocessing import Normalizer from lale.lib.lale import ConcatFeatures as Concat from sklearn.linear_model import LogisticRegression import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=(NoOp() & Normalizer()) >> Concat() >> LogisticRegression(),  max_evals=20,  scoring=""r2"", )",0.98
"from lale.lib.lale import Hyperopt from lale.lib.lale import NoOp from sklearn.preprocessing import Normalizer from lale.lib.lale import ConcatFeatures as Concat from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=(NoOp() & Normalizer())  >> Concat()  >> QuadraticDiscriminantAnalysis(),  max_evals=20,  scoring=""r2"", )",0.98
"from lale.lib.lale import Hyperopt from lale.lib.lale import NoOp from sklearn.preprocessing import RobustScaler from lale.lib.lale import ConcatFeatures as Concat from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=(NoOp() & RobustScaler())  >> Concat()  >> QuadraticDiscriminantAnalysis(),  max_evals=20,  scoring=""r2"", )",0.98
"from lale.lib.lale import Hyperopt from lale.lib.lale import NoOp from sklearn.preprocessing import RobustScaler from lale.lib.lale import ConcatFeatures as Concat from sklearn.linear_model import LogisticRegression import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=(NoOp() & RobustScaler()) >> Concat() >> LogisticRegression(),  max_evals=20,  scoring=""r2"", )",0.98
"from lale.lib.lale import Hyperopt from lale.lib.lale import NoOp from sklearn.preprocessing import MinMaxScaler from lale.lib.lale import ConcatFeatures as Concat from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=(NoOp() & MinMaxScaler())  >> Concat()  >> QuadraticDiscriminantAnalysis(),  max_evals=20,  scoring=""r2"", )",0.98
"from lale.lib.lale import Hyperopt from lale.lib.lale import NoOp from sklearn.preprocessing import MinMaxScaler from lale.lib.lale import ConcatFeatures as Concat from sklearn.linear_model import LogisticRegression import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=(NoOp() & MinMaxScaler()) >> Concat() >> LogisticRegression(),  max_evals=20,  scoring=""r2"", )",0.986667
"from lale.lib.lale import Hyperopt from lale.lib.lale import NoOp from sklearn.preprocessing import StandardScaler from lale.lib.lale import ConcatFeatures as Concat from sklearn.linear_model import LogisticRegression import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=(NoOp() & StandardScaler()) >> Concat() >> LogisticRegression(),  max_evals=20,  scoring=""r2"", )",0.98
"from lale.lib.lale import Hyperopt from lale.lib.lale import NoOp from sklearn.preprocessing import StandardScaler from lale.lib.lale import ConcatFeatures as Concat from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=(NoOp() & StandardScaler())  >> Concat()  >> QuadraticDiscriminantAnalysis(),  max_evals=20,  scoring=""r2"", )",0.986667
"from lale.lib.lale import Hyperopt from lale.lib.lale import NoOp from sklearn.preprocessing import Normalizer from lale.lib.lale import ConcatFeatures as Concat from sklearn.tree import DecisionTreeClassifier import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=(NoOp() & Normalizer()) >> Concat() >> DecisionTreeClassifier(),  max_evals=20,  scoring=""r2"", )",0.953333
"from lale.lib.lale import Hyperopt from lale.lib.lale import NoOp from sklearn.preprocessing import RobustScaler from lale.lib.lale import ConcatFeatures as Concat from sklearn.tree import DecisionTreeClassifier import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=(NoOp() & RobustScaler())  >> Concat()  >> DecisionTreeClassifier(),  max_evals=20,  scoring=""r2"", )",0.973333


## 8. Use pipeline accuracy to compute new PDDL action costs

In [10]:
feedback = optimizer.get_feedback(trained_pipelines)
G2L.feedback(feedback)
costs_table = [[str(k), G2L.costs[k]] for k in G2L.costs.keys()]
display(HTML(tabulate(costs_table, headers=['Pipeline element', 'Computed cost'], tablefmt='html')))

Pipeline element,Computed cost
(,0
NoOp(),63
&,0
Normalizer(),73
),0
>>,0
Concat(),63
LogisticRegression(),57
QuadraticDiscriminantAnalysis(),57
RobustScaler(),63


## 9. Invoke planner again on updated PDDL task and translate to pipelines

In [11]:
new_pipelines = G2L.get_plans(num_pipelines=NUM_PIPELINES, constraints=CONSTRAINTS)

with open('../output/domain_after_feedback.pddl', 'w') as f:
    f.write(G2L.last_task['domain'])
with open('../output/problem_after_feedback.pddl', 'w') as f:
    f.write(G2L.last_task['problem'])
with open('../output/second_planner_call.json', 'w') as f:
    f.write(json.dumps(G2L.last_planner_object, indent=3))

new_pipeline_table = [[pipelines[idx]['pipeline'], new_pipelines[idx]['pipeline']] for idx in range(min(len(pipelines), len(new_pipelines)))]
display(HTML(tabulate(new_pipeline_table, headers=['First iteration', 'After feedback'], tablefmt='html')))


Generating PDDL description...
Obtaining 10 plans with constraints []
Running the planner...
Created domain file in /tmp/4be13073-fe2c-4a8e-9c8a-ffd3c41cee87/domain.pddl
Created problem file in /tmp/4be13073-fe2c-4a8e-9c8a-ffd3c41cee87/problem.pddl
Running kstar /tmp/4be13073-fe2c-4a8e-9c8a-ffd3c41cee87/domain.pddl /tmp/4be13073-fe2c-4a8e-9c8a-ffd3c41cee87/problem.pddl --search "kstar(blind(),k=50,json_file_to_dump=result.json)"
Plans returned after 0.6532762050628662 seconds.
Translating plans to LALE pipelines.


First iteration,After feedback
( NoOp() & ( Normalizer() ) ) >> Concat() >> LogisticRegression(),PCA() >> GaussianNB()
( NoOp() & ( Normalizer() ) ) >> Concat() >> QuadraticDiscriminantAnalysis(),PCA() >> KNeighborsClassifier()
( NoOp() & ( RobustScaler() ) ) >> Concat() >> QuadraticDiscriminantAnalysis(),PCA() >> GradientBoostingClassifier()
( NoOp() & ( RobustScaler() ) ) >> Concat() >> LogisticRegression(),PCA() >> ExtraTreesClassifier()
( NoOp() & ( MinMaxScaler() ) ) >> Concat() >> QuadraticDiscriminantAnalysis(),PCA() >> RandomForestClassifier()
( NoOp() & ( MinMaxScaler() ) ) >> Concat() >> LogisticRegression(),StandardScaler() >> GaussianNB()
( NoOp() & ( StandardScaler() ) ) >> Concat() >> LogisticRegression(),MinMaxScaler() >> GaussianNB()
( NoOp() & ( StandardScaler() ) ) >> Concat() >> QuadraticDiscriminantAnalysis(),StandardScaler() >> KNeighborsClassifier()
( NoOp() & ( Normalizer() ) ) >> Concat() >> DecisionTreeClassifier(),MinMaxScaler() >> KNeighborsClassifier()
( NoOp() & ( RobustScaler() ) ) >> Concat() >> DecisionTreeClassifier(),StandardScaler() >> ExtraTreesClassifier()
