In [187]:
import bqplot as bq
import networkx
import numpy as np
import ipywidgets as ipw
from copy import copy, deepcopy
from IPython.display import display, HTML

In [188]:
# With indices as dag nodes

class Workflow(object):
    def __init__(self, name):
        self.dag = networkx.graph.Graph()
        self.name = name
        self.index_dict = {}
        #self.fig_layout = ipw.Layout(width='600px', height='800px')
        self.fig_layout = ipw.Layout(width='1200px', height='800px')
        self._task_names = []
    
    def add_task(self, task, dependencies=None):
        """
        Add instantiated Task object to the Workflow.
        If dependencies=None, then this task will be executed
        as soon as possible upon starting the Workflow.
        A Task may appear only once per Workflow.
        """
        
        # Ensure that tasks are not repeated.
        if task in self.dag.nodes():
            raise ValueError("Task already present in Workflow. Please pass a deepcopy if you wish to repeat the Task.")
        elif task.name in self._task_names:
            raise ValueError("Task name '{}' already present in Workflow. Please use a unique name.".format(task.name))
            
        # Determine index for this Task in this Workflow
        index = self.dag.number_of_nodes()
        # Inform workflow and task of this assignment
        self.dag.add_node(task, index=index)
        task.index[self] = index
        
        if dependencies is not None:
            for dependency in dependencies:
                self.dag.add_edge(dependency, task)
                
    def get_task_by_name(self, name):
        "Return the Task object with the given name in this Workflow."
        for task in self.dag.nodes():
            try:
                if task.name == name:
                    return task
            except AttributeError:
                print("{} has no name.".format(task))

    def draw_dag(self):
        "Return bqplot figure representing DAG."
        
        pos = networkx.nx_pydot.graphviz_layout(self.dag, prog='dot')
        N = self.dag.number_of_nodes()
        
        x, y = [[pos[node][i] for node in self.dag.nodes()] for i in range(2)]

        node_data = [
            {
                'label': str(node.index[self]),
                'shape': 'rect',
                **node.get_user_dict()
            }
            for node in self.dag.nodes()
        ]
        link_data = [
            {
                'source': source.index[self],
                'target': target.index[self]
            } 
            for source, target in self.dag.edges()
        ]

        xs = bq.LinearScale()
        ys = bq.LinearScale()
        scales = {'x': xs, 'y': ys}
        
        graph = bq.Graph(
            node_data=node_data,
            link_data=link_data,
            scales=scales,
            link_type='line',
            highlight_links=True,
            x=x, y=y,
            interactions = {
                'click': 'tooltip',
                'hover': 'select'
            }
        )
        
        graph.tooltip = bq.Tooltip(
            fields=self.dag.nodes()[0].user_fields
        )
        
        fig = bq.Figure(marks=[graph], layout=self.fig_layout)
        
        return fig
        
    

In [189]:
class Task(object):
    "One step in a Workflow. Must have a unique name."
    def __init__(self, name, input_files=[], output_files=[], 
                 params={}, num_cores=1, task_type='',
                substitute_strings=[], substitute_lists=[]):
        
        # Name of task (must be unique)
        self.name = name
        
        # Type of task (Notebook, CommandLine, etc.)
        self.task_type = task_type
        
        # List of other Tasks which must complete 
        # before this Task can be run.
        self.dependencies = []
        
        # List of Tasks which depend on this Task.
        self.children = []
        
        # Files which this Task takes as input 
        # and must be present before run.
        self.input_files = input_files
        
        # Files which are generated or modified by this Taks.
        self.output_files = output_files
        
        # Number of CPU cores to run the task on
        self.num_cores = num_cores
        
        # Map workflow to the node index which
        # represents this task in that workflow.
        # Tasks may be in multiple workflows,
        self.index = {}
        
        # Parameters to replace in other arguments
        self.params = params
        
        # List of names of fields to substitute params.
        # If a child class calls Task.__init__ with
        # substitute_strings or substitute_lists as
        # nonempty lists, they will be included here.
        self._substitute_strings = [
            'name',
            'task_type'
        ] + substitute_strings
        self._substitute_lists = [
            'input_files',
            'output_files'
        ] + substitute_lists
        
        self._substitute_fields()
        
        # Fields which are of interest to the user
        self.user_fields = [
            'name', 
            'task_type', 
            'input_files', 
            'output_files',
            'num_cores'
        ]
    
    def get_user_dict(self):
        "Generate dictionary of user field names and values"
        return {
            field: getattr(self, field) 
            for field in self.user_fields
        }
            
    def _substitute_fields(self):
        "Replace fields according to params dict."
        for field in self._substitute_strings:
            # Read current value
            before = getattr(self,field)
            # Replace fields
            after = before.format(**self.params)
            # Write new value
            setattr(self, field, after)
            
        for list_name in self._substitute_lists:
            field_list = getattr(self, list_name)
            # Read current values
            for i, before in enumerate(field_list):
                # Replace fields
                after = before.format(**self.params)
                # Write to working copy of list
                field_list[i] = after
            # Write working copy to actual list
            setattr(self, list_name, field_list)
                
    def _run(self):
        """
        Run this Task. Should be executed by a Workflow.
        This function should be overloaded by child classes.
        """
        print("Task run.")
        

class NotebookTask(Task):
    """
    
    Jupyter Notebook which should appear as a node in the Workflow DAG.
    If interactive == True, a kernel will be started and the
    notebook will be opened for user to interact with.
    Workflow will be blocked in the meantime.
    If false, notebook will be executed without opening,
    and Workflow will continue upon successful execution.
    """
    def __init__(self, name, interactive=True, **kwargs):
        self.task_type = 'NotebookTask'
        self.interactive = interactive
        
        super().__init__(name=name, **kwargs)
    
    def _run(self):
        print("Notebook run.")
    
    def _unblock(self):
        """
        Return control to Workflow after interactive notebook
        execution is complete.
        """
        pass

    
class CommandLineTask(Task):
    "Command Line Task to be executed as a Workflow step."
    def __init__(self, name, command, **kwargs):
        
        self.command = command
        
        super().__init__(
            name=name,
            task_type='CommandLineTask',
            substitute_strings=['command'], 
            **kwargs
        )
        
    
    def _run(self):
        print("Command Line run.")

        
class PythonFunctionTask(Task):
    "Python function call to be executed as a Workflow step."
    def __init__(self, name, fun, fun_args, fun_kwargs, **kwargs):
        # Actual callable function to be executed.
        self.fun = fun
        
        super().__init__(
            name=name, 
            task_type='PythonFunctionTask',
            **kwargs
        )
    
    def _run(self):
        print("Python function run.")
        return self.fun(*fun_args, **fun_kwargs)
    
class BatchTask(Task):
    "Task which will be submitted to a batch queue to execute."
    def __init__(self, name, batch_script, **kwargs):
        self.batch_file = batch_script
        
        super().__init__(
            name=name, 
            task_type='BatchTask',
            **kwargs
        )
        
    def _run(self):
        print("Batch run.")

# Droplet Workflow

In [190]:
droplet_wf = Workflow('Droplet Workflow')

# Radius of droplets (Angstroms)
droplet_radii = range(20,100, 10)
# Shape of droplets
shape = 'sphere'
# Base directory for computations
base_dir = '$SCRATCH/droplet'

# Number of substrate images in each dimension
nx, ny = 10, 10

# Number of parts (dump files) per simulation
parts_per_sim = 3

# Generate substrate
gen_mica_task = CommandLineTask(
    name='gen_mica_{nx}x{ny}',
    command='{base_dir}/gen_droplet/scripts/gen_mica.sh {nx} {ny} {out_file}',
    output_files = [
        "{out_file}"
    ],
    params=dict(
        base_dir=base_dir,
        nx=nx,
        ny=ny,
        out_file="{base_dir}/gen_droplet/lammps_data/mica_{nx}x{ny}.data".format(
            base_dir=base_dir,
            nx=nx,
            ny=ny
        )
    )
)
droplet_wf.add_task(gen_mica_task)

# Loop over droplet sizes
for radius in droplet_radii:
    # Create droplet
    gen_droplet_task = CommandLineTask(
        name="gen_droplet-{radius}A",
        command="{base_dir}/gen_droplet/bin/waterdroplet_tip4p_new.out {radius} {shape}",
        output_files = [
            "{out_file}"
        ],
        params=dict(
            base_dir=base_dir,
            radius=radius,
            shape=shape,
            out_file="{base_dir}/gen_droplet/dump/droplet_{radius}A.lammpstrj".format(
                base_dir=base_dir,
                radius=radius
            )
        )
    )      
    droplet_wf.add_task(gen_droplet_task)
    
    # Combine with substrate
    combine_task = CommandLineTask(
        name="combine-{radius}A",
        command="{base_dir}/gen_droplet/scripts/combine_sub_strip.pl {substrate} {film} {gap}",
        input_files = [
            "{substrate}",
            "{film}"
        ],
        output_files = [
            "{base_dir}/gen_droplet/lammps_data/droplet_on_mica-{radius}A.data"
        ],
        params=dict(
            base_dir=base_dir,
            radius=radius,
            substrate=gen_mica_task.output_files[0],
            film=gen_droplet_task.output_files[0],
            gap=radius,
        )
    )
    droplet_wf.add_task(
        combine_task,
        dependencies=[
            gen_mica_task,
            gen_droplet_task
        ]
    )
    
    simulate_task = BatchTask(
        name="simulate-{radius}A",
        batch_script="{base_dir}/sub_scripts/simulate_{radius}A.batch",
        input_files = [
            combine_task.output_files[0],
            "{base_dir}/lammps_scripts/simulate_{radius}A.batch"
        ],
        output_files = [
            "{base_dir}/data/{radius}A/atom"+str(part)
            for part in range(1,parts_per_sim+1)
        ],
        num_cores=parts_per_sim,
        params=dict(
            base_dir=base_dir,
            radius=radius,
        )
    )
    droplet_wf.add_task(
        simulate_task,
        dependencies=[combine_task]
    )
    
    # Analyze each part independently
    for part in range(1,parts_per_sim+1):
        parse_task = CommandLineTask(
            name='parse-{radius}A_atom{part}',
            command='{base_dir}/exec/parse.sh {infile} {outfile}',
            input_files = ["{infile}"],
            output_files = ["{outfile}"],
            params=dict(
                base_dir=base_dir,
                radius=radius,
                part=part,
                infile=simulate_task.output_files[part-1],
                outfile="{base_dir}/results/{radius}A/waters.txt".format(
                    base_dir=base_dir,
                    radius=radius
                )
            )
        )
        droplet_wf.add_task(
            parse_task,
            dependencies=[simulate_task]
        )
        
        analyze_task = CommandLineTask(
            name='analyze-{radius}A_atom{part}',
            command='{base_dir}/exec/analyze.sh {infile} {outfile}',
            input_files = ["{infile}"],
            output_files = ["{outfile}"],
            params=dict(
                base_dir=base_dir,
                radius=radius,
                part=part,
                infile=parse_task.output_files[0],
                outfile="{base_dir}/results/{radius}A/calculated.txt".format(
                    base_dir=base_dir,
                    radius=radius
                )
            )
        )
        
        droplet_wf.add_task(
            analyze_task,
            dependencies=[parse_task]
        )
    
    combine_parts_task = CommandLineTask(
        name='combine_parts-{radius}A',
        command='{base_dir}/results/combineParts.sh {radius}A',
        input_files = [
            "{base_dir}/results/{radius}A/atom"+str(part)+"/calculated.txt"
            for part in range(1,parts_per_sim+1)
        ],
        output_files=["{base_dir}/results/{radius}A/combined.txt"],
        params=dict(
            base_dir=base_dir,
            radius=radius,
        )
    )
    droplet_wf.add_task(
        combine_parts_task,
        dependencies=[droplet_wf.get_task_by_name(
            'analyze-{radius}A_atom{part}'.format(
                radius=radius,
                part=part
            )
        )
        for part in range(1,parts_per_sim+1)
        ]
    )
    
combine_sims_task = CommandLineTask(
    name='combine_sims',
    command='{base_dir}/results/combineSims.sh',
    input_files = [
        "{base_dir}/results/"+str(radius)+"A/combined.txt"
        for radius in droplet_radii
    ],
    output_files=["{base_dir}/results/allResults.txt"],
    params=dict(base_dir=base_dir)
)
droplet_wf.add_task(
    combine_sims_task,
    dependencies=[
        droplet_wf.get_task_by_name(
            'combine_parts-{radius}A'.format(
                radius=radius
            )
        )
        for radius in droplet_radii
    ]
)
analysis_notebook_task = NotebookTask(
    name='analysis_notebook',
    interactive=True,
)
droplet_wf.add_task(
    analysis_notebook_task,
    dependencies=[combine_sims_task]
)

In [191]:
l=[]
for source, target in droplet_wf.dag.edges():
    source_name = source.name if source is not None else None
    target_name = target.name if target is not None else None
    l.append((source_name,target_name))

In [192]:
l[-5:]

[('analyze-90A_atom2', 'combine_parts-90A'),
 ('parse-90A_atom3', 'analyze-90A_atom3'),
 ('analyze-90A_atom3', 'combine_parts-90A'),
 ('combine_parts-90A', 'combine_sims'),
 ('combine_sims', 'analysis_notebook')]

In [211]:
fig = droplet_wf.draw_dag()
tb = bq.Toolbar(figure=fig)
display(fig,tb)

A Jupyter Widget

A Jupyter Widget

In [102]:
[1,2,3,2].index(2)

1

In [93]:
w = Workflow('Example')
c = CommandLineTask(name='cmd', command_string="echo 'hello, bash'")
n = NotebookTask(name='nbtask')
p = PythonFunctionTask(name='myfun', fun=lambda: print('hello python'))

w.add_task(c)
w.add_task(p)
w.add_task(n, [c,p])

for i in range(5):
    w.add_task(
        CommandLineTask(name='cmd{}'.format(i+1), command_string = "echo 'test {}'".format(i+1)), 
        dependencies=[p]
    )
    pass

f = w.draw_dag()
f

A Jupyter Widget

In [83]:
c

<__main__.CommandLineTask at 0x2aaab6cb1978>

In [82]:
deepcopy(c)

TypeError: cannot serialize '_io.FileIO' object