# Using AI planning to explore data science pipelines

In [1]:
from __future__ import print_function
import os
import shutil
import sys
import types

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../grammar2lale")))

# Start from an empty output directory for the planning and result files.
# Done with shutil/os instead of shelling out to `rm -rf` / `mkdir -p`, so the
# cell does not depend on a POSIX shell being available.
shutil.rmtree('../output', ignore_errors=True)  # a missing directory is fine, as with `rm -f`
os.makedirs('../output', exist_ok=True)


0

## 1. Start with a Data Science grammar, in EBNF format

In [2]:
# This is the grammar file we will use
GRAMMAR_FILE = "../grammar/dsgrammar-subset-sklearn.bnf"

# Copy grammar to the output directory.
# shutil.copy raises if the grammar file or the destination is missing, whereas
# the previous shell `cp` only returned a non-zero status that nothing checked.
shutil.copy(GRAMMAR_FILE, "../output/")

0

## 2. Convert the grammar into an HTN domain and problem and use [HTN to PDDL](https://github.com/ronwalf/HTN-Translation) to translate to a PDDL task

In [3]:
from grammar2lale import Grammar2Lale

# Build the grammar translator, then save its HTN domain and problem under ../output.
G2L = Grammar2Lale(grammar_file=GRAMMAR_FILE)
for htn_path, htn_attr in (("../output/domain.htn", "htn_domain"),
                           ("../output/problem.htn", "htn_problem")):
    with open(htn_path, "w") as htn_file:
        # The attribute is read only after its file is open, domain first, then problem.
        htn_file.write(getattr(G2L, htn_attr))


Generating HTN specification from grammar
Printing HTN domain


## 3. Extend the PDDL task by integrating soft constraints

In [5]:
import re
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# Safety step: grammar symbols whose text does not start with a letter or an
# underscore (e.g., parens) are given a cost of 0.
starts_like_identifier = re.compile('^[_a-zA-Z]').match
for grammar_symbol in G2L.htn.mapping:
    if starts_like_identifier(str(grammar_symbol)) is None:
        G2L.costs[grammar_symbol] = 0

# Constraints offered in the selection form, sorted in place.
constraint_options = G2L.get_selectable_constraints()
constraint_options.sort()

# `interact` variant used for the constraint selection form; manual mode with
# a button labelled 'Generate PDDL'.
interact_pipeline_params = interact.options(manual_name='Generate PDDL', manual=True)


# Module-level state that later cells read; the two upper-case names are
# rebound by the selection handler.
pipelines, CONSTRAINTS = [], []
NUM_PIPELINES = 10


# This is the function that handles the constraint selection
@interact_pipeline_params(num_pipelines=widgets.IntSlider(value=NUM_PIPELINES, min=1, max=100),
                          constraints=widgets.SelectMultiple(options=constraint_options,
                                           description='Search constraints',
                                           rows=min(20, len(constraint_options))))
def select_pipeline_gen_params(num_pipelines, constraints):
    """Record the selected settings and write the matching PDDL task to ../output.

    Parameters
    ----------
    num_pipelines : value of the slider (1..100, starting at NUM_PIPELINES).
    constraints : the entries selected in the 'Search constraints' list;
        stored as a list in CONSTRAINTS.

    Side effects: rebinds the module-level NUM_PIPELINES and CONSTRAINTS, calls
    G2L.create_pddl_task with them, and writes G2L.last_task['domain'] and
    G2L.last_task['problem'] to ../output/domain.pddl and ../output/problem.pddl.

    NOTE(review): the decorator is configured with manual=True, so ipywidgets
    runs this body only when the 'Generate PDDL' button is pressed, not when
    the cell executes. Confirm that the later cells behave as intended when
    the button was never pressed (they then see the default NUM_PIPELINES and
    an empty CONSTRAINTS).
    """
    # Only the names rebound below need a `global` declaration.
    global NUM_PIPELINES
    global CONSTRAINTS
    NUM_PIPELINES = num_pipelines
    CONSTRAINTS = list(constraints)
    G2L.create_pddl_task(NUM_PIPELINES, CONSTRAINTS)
    with open("../output/domain.pddl", "w") as f:
        f.write(G2L.last_task['domain'])
    with open("../output/problem.pddl", "w") as f:
        f.write(G2L.last_task['problem'])


interactive(children=(IntSlider(value=10, description='num_pipelines', min=1), SelectMultiple(description='Sea…

## 4. Use a planner to solve the planning task (in this case, [K*](https://github.com/ctpelok77/kstar))

In [6]:
import json

# Invoke the planner, then keep the planner object G2L exposes as indented JSON.
G2L.run_pddl_planner()
with open("../output/first_planner_call.json", "w") as planner_dump:
    planner_json = json.dumps(G2L.last_planner_object, indent=3)
    planner_dump.write(planner_json)

Running the planner...
Created domain file in /tmp/81976c59-8cf6-4ba8-94a9-73bd33ac997a/domain.pddl
Created problem file in /tmp/81976c59-8cf6-4ba8-94a9-73bd33ac997a/problem.pddl
Running kstar /tmp/81976c59-8cf6-4ba8-94a9-73bd33ac997a/domain.pddl /tmp/81976c59-8cf6-4ba8-94a9-73bd33ac997a/problem.pddl --search "kstar(blind(),k=50,json_file_to_dump=result.json)"


sh: 1: Syntax error: Bad fd number


INFO     Running translator.
INFO     translator input: ['/tmp/81976c59-8cf6-4ba8-94a9-73bd33ac997a/domain.pddl', '/tmp/81976c59-8cf6-4ba8-94a9-73bd33ac997a/problem.pddl']
INFO     translator arguments: []
INFO     translator time limit: None
INFO     translator memory limit: None
INFO     callstring: /usr/bin/python3 /workspace/kstar/builds/release64/bin/translate/translate.py /tmp/81976c59-8cf6-4ba8-94a9-73bd33ac997a/domain.pddl /tmp/81976c59-8cf6-4ba8-94a9-73bd33ac997a/problem.pddl
Parsing...
Parsing: [0.000s CPU, 0.007s wall-clock]
Normalizing task... [0.000s CPU, 0.001s wall-clock]
Instantiating...
Generating Datalog program... [0.000s CPU, 0.001s wall-clock]
Normalizing Datalog program...
Normalizing Datalog program: [0.020s CPU, 0.014s wall-clock]
Preparing model... [0.000s CPU, 0.004s wall-clock]
Generated 318 rules.
Computing model... [0.030s CPU, 0.025s wall-clock]
507 relevant atoms
1714 auxiliary atoms
2221 final queue length
2426 total queue pushes
Completing instantiation

## 5. Translate plans to [LALE](https://github.com/IBM/lale) Data Science pipelines

In [8]:
# Translate to pipelines
# Rebinds the module-level `pipelines` to whatever G2L.translate_to_pipelines
# returns when asked for NUM_PIPELINES entries.
pipelines = G2L.translate_to_pipelines(NUM_PIPELINES)

from pipeline_optimizer import PipelineOptimizer
from sklearn.datasets import load_iris

from lale.helpers import to_graphviz
from lale.lib.sklearn import *
from lale.lib.lale import ConcatFeatures as Concat
from lale.lib.lale import NoOp
from lale.lib.sklearn import KNeighborsClassifier as KNN
from lale.lib.sklearn import OneHotEncoder as OneHotEnc
from lale.lib.sklearn import Nystroem
from lale.lib.sklearn import PCA

# The optimizer receives the iris data in the form load_iris yields with return_X_y=True.
iris_xy = load_iris(return_X_y=True)
optimizer = PipelineOptimizer(iris_xy)
# Turn every pipeline definition into its LALE counterpart, in order.
LALE_pipelines = list(map(optimizer.to_lale_pipeline, pipelines))

# Callback for the pipeline dropdown
def show_pipeline(pipeline):
    """Print a pipeline's id, cost and textual form, then display its graph.

    `pipeline` is indexed with the keys 'id', 'score', 'pipeline' and
    'lale_pipeline'. The 'id' value is joined into the header as-is, so it
    has to be a string; the 'score' value is converted with str().
    """
    header_parts = ["Displaying pipeline ", pipeline['id'], ", with cost ", str(pipeline['score'])]
    print("".join(header_parts))
    print(pipeline['pipeline'])
    print('==================================================================================')
    # Two newlines plus print's own line ending give three blank lines.
    print("\n\n")
    graph = to_graphviz(pipeline['lale_pipeline'])
    display(graph)

# Options for the pipeline selector: [label, value] pairs, labelled by each
# entry's 'pipeline' value.
display_pipelines = []
for lale_entry in LALE_pipelines:
    display_pipelines.append([lale_entry['pipeline'], lale_entry])

interact(show_pipeline, pipeline=display_pipelines)

Translating plans to LALE pipelines.


interactive(children=(Dropdown(description='pipeline', options=(['Normalizer() >> LogisticRegression()', {'id'…

<function __main__.show_pipeline(pipeline)>

## 6. Run one of the pipelines on sample data

## 7. Train hyperparameters and evaluate the resulting LALE pipelines

In [9]:
# Run the optimizer over the planner's pipelines; the call returns a pair that
# is unpacked into `trained_pipelines` and `dropped_pipelines`.
# NOTE(review): `dropped_pipelines` is not read anywhere else in the visible notebook.
trained_pipelines, dropped_pipelines = optimizer.evaluate_and_train_pipelines(pipelines)

Plan 1/10
Starting to optimize Normalizer() >> LogisticRegression()
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:04<00:00,  4.87trial/s, best loss: -0.8099999999999999]
Fit completed.
Predict completed.
Best accuracy: 0.8866666666666667
Completed optimization for Normalizer() >> LogisticRegression()
Plan 2/10
Starting to optimize Normalizer() >> GaussianNB()
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:04<00:00,  4.66trial/s, best loss: -0.97]
Fit completed.
Predict completed.
Best accuracy: 0.9733333333333334
Completed optimization for Normalizer() >> GaussianNB()
Plan 3/10
Starting to optimize Normalizer() >> DecisionTreeClassifier()
100%|█████████████████████████████████████████████████████████████

In [10]:
from IPython.display import HTML
from tabulate import tabulate
from lale.pretty_print import to_string

def show_pipeline_accuracy(tp):
    """Display an HTML table with one row per trained pipeline and its accuracy.

    Each item of `tp` is indexed with the keys 'trained_pipeline' (passed to
    to_string) and 'best_accuracy' (converted with str()).
    """
    rows = []
    for entry in tp:
        # Newlines in the printed pipeline are replaced with <br/> markup.
        source_html = to_string(entry['trained_pipeline']).replace('\n', '<br/>')
        rows.append([source_html, str(entry['best_accuracy'])])
    table_html = tabulate(rows, headers=['Pipeline', 'Accuracy'], tablefmt='html')
    display(HTML(table_html))


# Show the accuracy table for the pipelines returned by evaluate_and_train_pipelines.
show_pipeline_accuracy(trained_pipelines)

Pipeline,Accuracy
"from lale.lib.lale import Hyperopt from sklearn.preprocessing import Normalizer from sklearn.linear_model import LogisticRegression import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=Normalizer() >> LogisticRegression(), max_evals=20, scoring=""r2"" )",0.886667
"from lale.lib.lale import Hyperopt from sklearn.preprocessing import Normalizer from sklearn.naive_bayes import GaussianNB import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=Normalizer() >> GaussianNB(), max_evals=20, scoring=""r2"" )",0.973333
"from lale.lib.lale import Hyperopt from sklearn.preprocessing import Normalizer from sklearn.tree import DecisionTreeClassifier import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=Normalizer() >> DecisionTreeClassifier(),  max_evals=20,  scoring=""r2"", )",0.94
"from lale.lib.lale import Hyperopt from sklearn.preprocessing import Normalizer from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=Normalizer() >> QuadraticDiscriminantAnalysis(),  max_evals=20,  scoring=""r2"", )",0.973333
"from lale.lib.lale import Hyperopt from sklearn.preprocessing import Normalizer from sklearn.neighbors import KNeighborsClassifier as KNN import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=Normalizer() >> KNN(), max_evals=20, scoring=""r2"" )",1.0
"from lale.lib.lale import Hyperopt from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=StandardScaler() >> LogisticRegression(),  max_evals=20,  scoring=""r2"", )",0.98
"from lale.lib.lale import Hyperopt from sklearn.preprocessing import RobustScaler from sklearn.linear_model import LogisticRegression import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=RobustScaler() >> LogisticRegression(),  max_evals=20,  scoring=""r2"", )",0.98
"from lale.lib.lale import Hyperopt from sklearn.preprocessing import MinMaxScaler from sklearn.linear_model import LogisticRegression import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=MinMaxScaler() >> LogisticRegression(),  max_evals=20,  scoring=""r2"", )",0.94
"from lale.lib.lale import Hyperopt from sklearn.preprocessing import StandardScaler from sklearn.neighbors import KNeighborsClassifier as KNN import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=StandardScaler() >> KNN(), max_evals=20, scoring=""r2"" )",1.0
"from lale.lib.lale import Hyperopt from sklearn.preprocessing import RobustScaler from sklearn.neighbors import KNeighborsClassifier as KNN import lale lale.wrap_imported_operators() pipeline = Hyperopt(  estimator=RobustScaler() >> KNN(), max_evals=20, scoring=""r2"" )",1.0


## 8. Use pipeline accuracy to compute new PDDL action costs

In [11]:
# Derive feedback from the trained pipelines and pass it to G2L, then list
# every entry of G2L.costs in a table.
feedback = optimizer.get_feedback(trained_pipelines)
G2L.feedback(feedback)
costs_table = [[str(element), cost] for element, cost in G2L.costs.items()]
costs_html = tabulate(costs_table, headers=['Pipeline element', 'Computed cost'], tablefmt='html')
display(HTML(costs_html))

Pipeline element,Computed cost
Normalizer(),70
>>,0
LogisticRegression(),74
GaussianNB(),61
DecisionTreeClassifier(),77
QuadraticDiscriminantAnalysis(),61
KNeighborsClassifier(),50
StandardScaler(),54
RobustScaler(),54
MinMaxScaler(),77


## 9. Invoke planner again on updated PDDL task and translate to pipelines

In [12]:
# Ask for plans again with the same settings as the first round.
new_pipelines = G2L.get_plans(num_pipelines=NUM_PIPELINES, constraints=CONSTRAINTS)

# Keep the PDDL task and the planner object of this second call.
with open('../output/domain_after_feedback.pddl', 'w') as domain_out:
    domain_out.write(G2L.last_task['domain'])
with open('../output/problem_after_feedback.pddl', 'w') as problem_out:
    problem_out.write(G2L.last_task['problem'])
with open('../output/second_planner_call.json', 'w') as planner_out:
    planner_json = json.dumps(G2L.last_planner_object, indent=3)
    planner_out.write(planner_json)

# Pair the pipelines of both rounds by position; zip stops at the shorter list.
new_pipeline_table = [
    [before['pipeline'], after['pipeline']]
    for before, after in zip(pipelines, new_pipelines)
]
comparison_html = tabulate(new_pipeline_table, headers=['First iteration', 'After feedback'], tablefmt='html')
display(HTML(comparison_html))


Generating PDDL description...
Obtaining 10 plans with constraints []
Running the planner...
Created domain file in /tmp/61e5fc61-b4f6-4948-90cb-ba0225c2f591/domain.pddl
Created problem file in /tmp/61e5fc61-b4f6-4948-90cb-ba0225c2f591/problem.pddl
Running kstar /tmp/61e5fc61-b4f6-4948-90cb-ba0225c2f591/domain.pddl /tmp/61e5fc61-b4f6-4948-90cb-ba0225c2f591/problem.pddl --search "kstar(blind(),k=50,json_file_to_dump=result.json)"


sh: 1: Syntax error: Bad fd number


INFO     Running translator.
INFO     translator input: ['/tmp/61e5fc61-b4f6-4948-90cb-ba0225c2f591/domain.pddl', '/tmp/61e5fc61-b4f6-4948-90cb-ba0225c2f591/problem.pddl']
INFO     translator arguments: []
INFO     translator time limit: None
INFO     translator memory limit: None
INFO     callstring: /usr/bin/python3 /workspace/kstar/builds/release64/bin/translate/translate.py /tmp/61e5fc61-b4f6-4948-90cb-ba0225c2f591/domain.pddl /tmp/61e5fc61-b4f6-4948-90cb-ba0225c2f591/problem.pddl
Parsing...
Parsing: [0.010s CPU, 0.007s wall-clock]
Normalizing task... [0.000s CPU, 0.001s wall-clock]
Instantiating...
Generating Datalog program... [0.000s CPU, 0.001s wall-clock]
Normalizing Datalog program...
Normalizing Datalog program: [0.020s CPU, 0.013s wall-clock]
Preparing model... [0.000s CPU, 0.004s wall-clock]
Generated 318 rules.
Computing model... [0.020s CPU, 0.024s wall-clock]
507 relevant atoms
1714 auxiliary atoms
2221 final queue length
2426 total queue pushes
Completing instantiation

First iteration,After feedback
Normalizer() >> LogisticRegression(),PCA() >> KNeighborsClassifier()
Normalizer() >> GaussianNB(),PCA() >> GradientBoostingClassifier()
Normalizer() >> DecisionTreeClassifier(),PCA() >> RandomForestClassifier()
Normalizer() >> QuadraticDiscriminantAnalysis(),PCA() >> ExtraTreesClassifier()
Normalizer() >> KNeighborsClassifier(),StandardScaler() >> KNeighborsClassifier()
StandardScaler() >> LogisticRegression(),RobustScaler() >> KNeighborsClassifier()
RobustScaler() >> LogisticRegression(),StandardScaler() >> ExtraTreesClassifier()
MinMaxScaler() >> LogisticRegression(),RobustScaler() >> ExtraTreesClassifier()
StandardScaler() >> KNeighborsClassifier(),StandardScaler() >> GradientBoostingClassifier()
RobustScaler() >> KNeighborsClassifier(),RobustScaler() >> GradientBoostingClassifier()
