In [1]:
# default_exp run_flow

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# export


import asyncio
import multiprocessing
import os
import subprocess
import sys
from itertools import product
from pathlib import Path
from typing import Any, Dict, Iterable

import pandas as pd
from fastcore.script import call_parse
from nbdev.export import find_default_export, get_config, read_nb

from sciflow.utils import chunks, get_flow_path, prepare_env

# Verify and Run Sciflow Flows

In [4]:
nb_path = Path(Path(".").resolve(), "test", "test_export.ipynb")
flow_path = get_flow_path(nb_path, flow_provider="sagemaker")
nb = read_nb(nb_path)
module_name = find_default_export(nb["cells"]).replace(".", "/")
test_module = os.path.join(get_config().path("lib_path"), f"{module_name}.py")
flows_dir = get_config(cfg_name="test/settings.ini").path("flows_path")
flow_name = os.path.basename(test_module)

# Verify or Run an Individual Flow

> `subprocess` is used to run flows as most flow providers bundle a CLI which makes for a consistent execution experience with minimal adaptation.

In [5]:
# export


def run_shell_cmd(script: str):
    pipe = subprocess.Popen(
        "%s" % script, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True
    )
    output = pipe.communicate()[0]
    return pipe, output.decode("utf-8").strip()

In [6]:
# export


def make_shell_cmd(
    flow_nb_path, flow_provider="metaflow", flow_command="show", params=None
):
    prepare_env()
    if flow_nb_path.suffix == ".ipynb":
        flow_path = get_flow_path(flow_nb_path, flow_provider=flow_provider)
    else:
        flow_path = flow_nb_path
    if params:
        args = " ".join([f"--{k} {v}" for k, v in params.items()])

        flow_command = f"{flow_command} {args}"

    return f"python '{flow_path}' {flow_command}"

In [7]:
# export


def check_call_flow(
    flow_nb_path, flow_provider="metaflow", flow_command="show", params=None
):
    cmd = make_shell_cmd(flow_nb_path, flow_provider, flow_command, params)
    pipe, output = run_shell_cmd(cmd)
    return pipe.returncode, output

In [8]:
ret_code, output = check_call_flow(
    nb_path, flow_provider="metaflow", flow_command="show"
)
assert ret_code == 0

In [9]:
# slow

ret_code, output = check_call_flow(
    nb_path, flow_provider="metaflow", flow_command="run"
)
assert ret_code == 0
print(output)

Metaflow 2.5.2 executing TestExportFlow for user:'Donal Simmie'
Validating your flow...
    The graph looks good!
Running pylint...
    Pylint is happy!
2022-05-11 06:20:23.231 Workflow starting (run-id 1652250022971798):
2022-05-11 06:20:23.293 [1652250022971798/start/1 (pid 8965)] Task is starting.
2022-05-11 06:20:28.021 [1652250022971798/start/1 (pid 8965)] 3
2022-05-11 06:20:28.344 [1652250022971798/start/1 (pid 8965)] Task finished successfully.
2022-05-11 06:20:28.424 [1652250022971798/preprocess/2 (pid 8968)] Task is starting.
2022-05-11 06:20:33.153 [1652250022971798/preprocess/2 (pid 8968)] Preprocessing input data from /home/sagemaker-user/git/sciflow/nbs...
2022-05-11 06:20:34.490 [1652250022971798/preprocess/2 (pid 8968)] Task finished successfully.
2022-05-11 06:20:34.573 [1652250022971798/train/3 (pid 8971)] Task is starting.
2022-05-11 06:20:39.297 [1652250022971798/train/3 (pid 8971)] Training /home/sagemaker-user/git/sciflow on /home/sagemaker-user/git/sciflow/nbs...


In [12]:
ret_code, output = check_call_flow(
    nb_path, flow_provider="sagemaker", flow_command="show"
)
assert ret_code == 0

In [13]:
# slow

ret_code, output = check_call_flow(
    nb_path, flow_provider="sagemaker", flow_command="run"
)
assert ret_code == 0
print(output)

Starting Sciflow generated pipeline: pipeline-2022-05-11-06-23-18-450
{'PipelineArn': 'arn:aws:sagemaker:eu-west-1:368653567616:pipeline/test-export', 'PipelineExecutionArn': 'arn:aws:sagemaker:eu-west-1:368653567616:pipeline/test-export/execution/jkax8pbvfgdh', 'PipelineExecutionDisplayName': 'execution-1652250202244', 'PipelineExecutionStatus': 'Executing', 'CreationTime': datetime.datetime(2022, 5, 11, 6, 23, 22, 151000, tzinfo=tzlocal()), 'LastModifiedTime': datetime.datetime(2022, 5, 11, 6, 23, 22, 151000, tzinfo=tzlocal()), 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:eu-west-1:368653567616:user-profile/d-likrmmebxomz/donal', 'UserProfileName': 'donal', 'DomainId': 'd-likrmmebxomz'}, 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:eu-west-1:368653567616:user-profile/d-likrmmebxomz/donal', 'UserProfileName': 'donal', 'DomainId': 'd-likrmmebxomz'}, 'ResponseMetadata': {'RequestId': 'd787e8a3-0d6f-48fb-8f04-2540f4760267', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-

# Verify/Run all Flows

In [14]:
# export


def check_call_flows(
    config,
    flow_provider="metaflow",
    flow_command="show",
    ignore_suffix=None,
    exit_on_error=True,
):
    flow_results = {}
    flows_dir = Path(config.path("flows_path"), flow_provider)

    if ignore_suffix:
        flow_file_names = [
            p for p in os.listdir(flows_dir) if not p.endswith(ignore_suffix)
        ]
    else:
        flow_file_names = os.listdir(flows_dir)
    ret_codes = []
    exit_code = 0
    for flow_file_name in flow_file_names:
        flow_name = os.path.basename(flow_file_name)
        if flow_file_name.startswith("_sciflow"):
            continue
        if flow_file_name.endswith(".py"):
            ret_code, output = check_call_flow(
                Path(flows_dir, flow_file_name), flow_command=flow_command
            )
            flow_results[flow_name] = ret_code, output
            if ret_code == 0:
                print(f"Flow: {flow_name} {flow_command} verified")
            else:
                print(
                    f"Flow: {flow_name} {flow_command} verification failed\nDetails:\n{output}"
                )
            ret_codes.append(ret_code)
    if any([rc != 0 for rc in ret_codes]):
        exit_code = 1
        try:
            # Exit with an error code if running from a non interactive Python environment.
            get_ipython().__class__.__name__
        except NameError:
            if exit_on_error:
                return sys.exit(exit_code)
    return exit_code

In [15]:
check_call_flows(get_config(cfg_name="test/settings.ini"))

Flow: test_export.py show verified
Flow: test_data_handling.py show verified
Flow: test_module.py show verified
Flow: test_multistep_no_params.py show verified
Flow: test_multistep.py show verified


0

In [16]:
# slow

check_call_flows(
    get_config(cfg_name="test/settings.ini"),
    flow_command="--no-pylint run",
    ignore_suffix="_no_params.py",
)

Flow: test_export.py --no-pylint run verified
Flow: test_data_handling.py --no-pylint run verified
Flow: test_module.py --no-pylint run verified
Flow: test_multistep.py --no-pylint run verified


0

In [17]:
check_call_flows(get_config(cfg_name="test/settings.ini"), flow_provider="sagemaker")

Flow: test_export.py show verified
Flow: test_data_handling.py show verified
Flow: test_module.py show verified
Flow: test_multistep_no_params.py show verified
Flow: test_multistep.py show verified


0

In [18]:
# slow

nb_path = Path(Path(".").resolve(), "test", "test_multistep.ipynb")
ret_code, output = check_call_flow(
    nb_path,
    flow_command="run",
    params={"traffic_percent": 1, "model_level": "dispatcher"},
)
print(output)
assert ret_code == 0

Metaflow 2.5.2 executing TestMultistepFlow for user:'Donal Simmie'
Validating your flow...
    The graph looks good!
Running pylint...
    Pylint is happy!
2022-05-11 06:30:30.209 Workflow starting (run-id 1652250629954348):
2022-05-11 06:30:30.274 [1652250629954348/start/1 (pid 9578)] Task is starting.
2022-05-11 06:30:31.654 [1652250629954348/start/1 (pid 9578)] The first step
2022-05-11 06:30:32.036 [1652250629954348/start/1 (pid 9578)] Task finished successfully.
2022-05-11 06:30:32.127 [1652250629954348/preprocess/2 (pid 9585)] Task is starting.
2022-05-11 06:30:33.491 [1652250629954348/preprocess/2 (pid 9585)] I captialised the message: THE FIRST STEP
2022-05-11 06:30:33.904 [1652250629954348/preprocess/2 (pid 9585)] Task finished successfully.
2022-05-11 06:30:33.990 [1652250629954348/fit/3 (pid 9592)] Task is starting.
2022-05-11 06:30:35.750 [1652250629954348/fit/3 (pid 9592)] Task finished successfully.
2022-05-11 06:30:35.831 [1652250629954348/evaluate/4 (pid 9599)] Task is 

# Aynsc Flow Running

> Run the flow you are working on from the notebook you are working on. This maximises the amount of experiments you can run as you don't have down time. While long running tasks are running you can keep exploring! :-)

In [25]:
# export


async def flow_task(
    flow_nb_path, flow_provider="metaflow", flow_command="run", params=None
):
    cmd = make_shell_cmd(flow_nb_path, flow_provider, flow_command, params)

    proc = await asyncio.create_subprocess_shell(
        cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )

    stdout, stderr = await proc.communicate()

    # print(f"[{cmd!r} exited with {proc.returncode}]")
    err = ""
    out = ""
    if stderr:
        err = f'[stderr]\n{stderr.decode("utf-8").strip()}'
    if stdout:
        out = f'[stdout]\n{stdout.decode("utf-8").strip()}'

    return proc.returncode, err + out

In [20]:
# export


def run_flow_async(
    flow_nb_path, flow_provider="metaflow", flow_command="run", params=None
):
    loop = asyncio.get_event_loop()
    task = loop.create_task(
        flow_task(flow_nb_path, flow_provider, flow_command, params)
    )
    return task

In [21]:
# slow

task = run_flow_async(
    Path(Path(".").resolve(), "test", "test_multistep.ipynb"),
    params={"traffic_percent": 10, "workers": 12},
)
task

<Task pending name='Task-4' coro=<flow_task() running at /tmp/ipykernel_8935/3217104127.py:3>>

In [22]:
# slow
await task
assert 0 == task.result()[0]

In [26]:
# slow

task = run_flow_async(
    Path(Path(".").resolve(), "test", "test_export.ipynb"),
    flow_provider="sagemaker",
    params={"some_param": "async"},
)
task

<Task pending name='Task-16' coro=<flow_task() running at /tmp/ipykernel_8935/2576585770.py:3>>

In [27]:
# slow

await task
assert 0 == task.result()[0]

In [28]:
# slow

task = run_flow_async(
    Path(Path(".").resolve(), "test", "test_multistep.ipynb"),
    flow_provider="sagemaker",
    params={"traffic_percent": 10, "workers": 12},
)
task

<Task pending name='Task-22' coro=<flow_task() running at /tmp/ipykernel_8935/2576585770.py:3>>

In [29]:
# slow

await task
assert 0 == task.result()[0]

In [30]:
# export


async def run_flows_async(
    config,
    flow_provider="metaflow",
    flow_command="run",
    params=None,
    ignore_suffix=None,
    exit_on_error=True,
):
    flow_tasks = {}
    flows_dir = Path(config.path("flows_path"), flow_provider)

    if ignore_suffix:
        flow_file_names = [
            p for p in os.listdir(flows_dir) if not p.endswith(ignore_suffix)
        ]
    else:
        flow_file_names = os.listdir(flows_dir)
    ret_codes = []
    exit_code = 0
    loop = asyncio.get_event_loop()

    for flow_file_name in flow_file_names:
        flow_name = os.path.basename(flow_file_name)
        if flow_file_name.startswith("_sciflow"):
            continue
        if flow_file_name.endswith(".py"):
            task = loop.create_task(
                flow_task(
                    Path(flows_dir, flow_file_name), flow_provider, flow_command, params
                )
            )
            flow_tasks[flow_name] = task

    for flow_name, task in flow_tasks.items():
        await task
        ret_code = task.result()[0]
        if ret_code == 0:
            print(f"Flow: {flow_name} {flow_command} verified")
        else:
            print(
                f"Flow: {flow_name} {flow_command} verification failed\nDetails:\n{output}"
            )
        ret_codes.append(ret_code)
    if any([rc != 0 for rc in ret_codes]):
        exit_code = 1
        try:
            # Exit with an error code if running from a non interactive Python environment.
            get_ipython().__class__.__name__
        except NameError:
            if exit_on_error:
                return sys.exit(exit_code)
    return exit_code

In [31]:
# slow

task = run_flows_async(
    get_config(cfg_name="test/settings.ini"),
    flow_command="--no-pylint run",
    ignore_suffix="_no_params.py",
)

In [32]:
# slow

await task

Flow: test_export.py --no-pylint run verified
Flow: test_data_handling.py --no-pylint run verified
Flow: test_module.py --no-pylint run verified
Flow: test_multistep.py --no-pylint run verified


0

In [34]:
# slow

task = run_flows_async(
    get_config(cfg_name="test/settings.ini"),
    flow_provider="sagemaker",
    flow_command="run",
    ignore_suffix="_no_params.py",
)

In [35]:
# slow

await task

Flow: test_export.py run verified
Flow: test_data_handling.py run verified
Flow: test_module.py run verified
Flow: test_multistep.py run verified


0

# Utilities to Search Parameter Space

In [36]:
param_grid = {
    "traffic_percent": [1, 5, 10, 20, 50, 100],
    "model_level": ["router", "dispatcher"],
    "workers": [1],
}

In [37]:
# export


def iter_param_grid(param_grid):
    # https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/model_selection/_search.py
    for p in [param_grid]:
        # Always sort the keys of a dictionary, for reproducibility
        items = sorted(p.items())
        if not items:
            yield {}
        else:
            keys, values = zip(*items)
            for v in product(*values):
                params = dict(zip(keys, v))
                yield params

In [38]:
assert [{"a": 1, "b": 1, "c": "hello"}, {"a": 2, "b": 1, "c": "hello"}] == list(
    iter_param_grid({"a": [1, 2], "b": [1], "c": ["hello"]})
)

In [39]:
# export


def sample_grid_space(param_grid: Dict[str, Iterable[Any]], num_samples: int):
    samples = []
    for i, sample in enumerate(iter_param_grid(param_grid)):
        samples.append(sample)
    if num_samples < len(samples):
        samples = pd.Series(samples).sample(num_samples).tolist()
    return samples

In [40]:
sample_space = sample_grid_space({"a": [1, 2], "b": [1], "c": ["hello"]}, 1)
assert sample_space[0]["b"] == 1
assert sample_space[0]["c"] == "hello"
assert sample_space[0]["a"] == 1 or sample_space[0]["a"] == 2

In [453]:
# export


async def search_batches(flow_nb_path, flow_provider, task_batches):
    futures = []
    loop = asyncio.get_event_loop()
    for task_batch in task_batches:
        tasks = [
            (
                loop.create_task(
                    flow_task(
                        flow_nb_path,
                        flow_provider,
                        flow_command="run",
                        params=param_spec,
                    )
                )
            )
            for param_spec in task_batch
        ]
        futures.append(await asyncio.wait(tasks))
    return futures

In [454]:
# export


def search_flow_grid(
    flow_nb_path,
    param_grid,
    flow_provider="metaflow",
    total_tasks=None,
    n_conc_tasks=None,
    local_mode=True,
):
    if total_tasks is None:
        total_tasks = len(list(iter_param_grid(param_grid)))

    if local_mode and n_conc_tasks is None:
        n_conc_tasks = int((multiprocessing.cpu_count() / 2) - 1)

    sample_space = sample_grid_space(param_grid, total_tasks)
    task_batches = list(chunks(sample_space, n_conc_tasks))
    futures = search_batches(flow_nb_path, flow_provider, task_batches)
    return futures

In [455]:
# export


def extract_results(future_tasks):
    completed_tasks = [
        item for sublist in [list(ft[0]) for ft in future_tasks] for item in sublist
    ]
    results = [t.result() for t in completed_tasks]
    return results

In [463]:
nb_path = Path(
    Path(".").resolve(),
    "test",
    "test_export.ipynb",
)
param_grid = {
    "traffic_percent": [1, 2, 3, 4, 5, 7, 8, 10, 20, 30, 40, 50],
    "model_level": ["dispatcher"],
    "workers": [1],
}

In [464]:
# slow

future_tasks = await ensure_future(
    search_flow_grid(
        nb_path,
        param_grid,
        flow_provider="sagemaker",
        total_tasks=None,
        n_conc_tasks=4,
    )
)

In [465]:
# slow

results = extract_results(future_tasks)
results

[(1,
  '[stderr]\nTraceback (most recent call last):\n  File "/opt/conda/envs/kernel-env/lib/python3.9/site-packages/sagemaker/workflow/pipeline.py", line 219, in upsert\n    response = self.create(role_arn, description, tags, parallelism_config)\n  File "/opt/conda/envs/kernel-env/lib/python3.9/site-packages/sagemaker/workflow/pipeline.py", line 121, in create\n    return self.sagemaker_session.sagemaker_client.create_pipeline(**kwargs)\n  File "/home/sagemaker-user/.local/lib/python3.9/site-packages/botocore/client.py", line 391, in _api_call\n    return self._make_api_call(operation_name, kwargs)\n  File "/home/sagemaker-user/.local/lib/python3.9/site-packages/botocore/client.py", line 719, in _make_api_call\n    raise error_class(parsed_response, operation_name)\nbotocore.exceptions.ClientError: An error occurred (ValidationException) when calling the CreatePipeline operation: Pipeline names must be unique within an AWS account and region. Pipeline with name (test-export) already e

In [468]:
# export


@call_parse
def sciflow_check_metaflows():
    check_call_flows(get_config())

In [469]:
# export


@call_parse
def sciflow_check_sagemaker_flows():
    check_call_flows(get_config(), flow_provider="sagemaker")

In [470]:
# export


@call_parse
def sciflow_run_metaflows():
    check_call_flows(get_config(), flow_command="--no-pylint run")

In [471]:
# export


@call_parse
def sciflow_run_sagemaker_flows():
    check_call_flows(get_config(), flow_command="run", flow_provider="sagemaker")