In [None]:
# | default_exp converters.to_metaflow

In [None]:
%load_ext autoreload
%autoreload 2

# Imports

In [None]:
# | export


import logging
import os
import shutil
from importlib import reload
from pathlib import Path, PosixPath
from typing import Iterable

import numpy as np
from execnb.nbio import read_nb
from fastcore.script import Param, bool_arg, call_parse
from nbdev.config import get_config
from nbdev.doclinks import nbglob
from scilint.utils import configure_logging

from sciflow.params import ParamMeta, extract_param_meta, params_as_dict
from sciflow.parse_module import FuncDetails, extract_module_only, extract_steps
from sciflow.utils import (
    find_default_export,
    get_flow_path,
    indent_multiline,
    prepare_env,
    titleize,
)

reload(logging)
logger = logging.getLogger()

# Test Setup

In [None]:
# Locate the test notebook and the python module nbdev exported from it.
nbs_dir = Path(".").resolve().parent
test_dir = nbs_dir / "test"
nb_path = test_dir / "test_export.ipynb"
nb = read_nb(nb_path)
# The default export is dotted ("pkg.mod"); turn it into a relative file path.
module_name = find_default_export(nb["cells"]).replace(".", "/")
test_module = os.path.join(get_config().path("lib_path"), f"{module_name}.py")
configure_logging("debug")

## `rename_steps_for_metaflow`

In [None]:
# | export


def rename_steps_for_metaflow(steps):
    """Rename the boundary steps in place to the names Metaflow requires.

    The first step becomes ``"start"``; the last step becomes ``"end"`` when
    it is not also the first. Steps in between keep their names. Nothing is
    returned - the step objects are mutated.
    """
    for position, flow_step in enumerate(steps):
        if position > 0 and position != len(steps) - 1:
            # Interior step: leave its name untouched.
            continue
        flow_step.name = "start" if position == 0 else "end"

In [None]:
steps = extract_steps(test_module)

In [None]:
# _modidx.py defines no steps, so extraction is expected to come back empty.
no_steps = extract_steps(os.path.join(get_config().path("lib_path"), "_modidx.py"))
assert len(no_steps) == 0

In [None]:
assert ["first", "preprocess", "train", "last"] == [step.name for step in steps]
rename_steps_for_metaflow(steps)
assert ["start", "preprocess", "train", "end"] == [step.name for step in steps]

# Convert Single Flow

## `nb_to_metaflow`

In [None]:
# | export


def nb_to_metaflow(nb_path: Path, flow_path: Path, silent=True):
    """Convert one notebook's exported module into a Metaflow flow file.

    Args:
        nb_path: Path of the notebook to convert.
        flow_path: Destination path of the generated flow module.
        silent: When False, print a one-line summary after a successful
            conversion.

    Returns:
        None. The conversion is skipped (with a debug log message) when the
        notebook has no default export or its exported module has no steps.
    """
    nb = read_nb(nb_path)
    config = get_config()
    lib_name = config.get("lib_name")
    module_name = find_default_export(nb["cells"])
    if not module_name:
        logger.debug(f"Ignoring conversion for nb with no default export: {nb_path}")
        return
    path_sep_module_name = module_name.replace(".", "/")
    nb_name = os.path.basename(nb_path)
    exported_module = os.path.join(
        config.path("lib_path"), f"{path_sep_module_name}.py"
    )
    steps = extract_steps(exported_module)
    if len(steps) == 0:
        logger.debug(f"Ignoring conversion for nb with no named steps: {nb_path}")
        return
    # Capture the original names before the steps are renamed: the generated
    # flow imports and calls the module functions under these names.
    orig_step_names = [step.name for step in steps]
    if len(steps) == 1:
        # A lone step would only be renamed to "start"; append a no-op step
        # so the rename below also yields an "end" step.
        steps.append(FuncDetails("end", None, None, False, "", "pass"))
    params = params_as_dict(nb_path)
    if len(params) == 0:
        # Logger.warn is a deprecated alias; Logger.warning is the supported call.
        logger.warning(f"No params cell found for: {os.path.basename(nb_path)}")
    flow_class_name = f"{titleize(extract_module_only(module_name))}Flow"
    rename_steps_for_metaflow(steps)
    write_module_to_file(
        flow_path,
        flow_class_name,
        lib_name,
        module_name,
        orig_step_names,
        steps,
        params,
    )
    if not silent:
        print(
            f"Converted {nb_name} to {flow_class_name} in: {os.path.basename(flow_path)}"
        )

## `write_module_to_file`

In [None]:
# | export


def write_module_to_file(
    flow_path: Path,
    flow_class_name: str,
    lib_name: str,
    module_name: str,
    orig_step_names: Iterable[str],
    steps: Iterable[FuncDetails],
    params: dict,
):
    """Write a complete Metaflow flow module to ``flow_path``.

    The generated file contains a header, the metaflow imports that the
    parameters require, imports of the step functions and parameters from
    ``{lib_name}.{module_name}``, a ``FlowSpec`` subclass holding the
    parameters and steps, and a ``__main__`` guard instantiating the flow.

    Args:
        flow_path: Destination file; missing parent directories are created.
        flow_class_name: Name of the generated ``FlowSpec`` subclass.
        lib_name: Package name used as the prefix of the imported module.
        module_name: Dotted module name (relative to ``lib_name``).
        orig_step_names: Function names to import from the module.
        steps: Step descriptions passed on to ``write_steps``.
        params: Parameter dict; its keys are imported from the module.
    """
    # makedirs(exist_ok=True) creates every missing ancestor and does not
    # fail if the directory already exists or appears concurrently.
    os.makedirs(flow_path.parent, exist_ok=True)
    fq_module_name = f"{lib_name}.{module_name}"
    param_meta = extract_param_meta(fq_module_name, params)
    has_mf_param = any(p.has_metaflow_param for p in param_meta.values())
    has_json_param = any(p.is_json_type for p in param_meta.values())
    ind = "    "
    # The header written below declares utf-8, so encode the file that way
    # instead of relying on the locale's default encoding.
    with open(flow_path, "w", encoding="utf-8") as flow_file:
        flow_file.write("#!/usr/bin/env python\n")
        flow_file.write("# coding=utf-8\n")
        flow_file.write("# SCIFLOW GENERATED FILE - EDIT COMPANION NOTEBOOK\n")
        mf_params_import = "from metaflow import FlowSpec, step, current"
        if has_mf_param:
            mf_params_import += ", Parameter"
        if has_json_param:
            mf_params_import += ", JSONType"
            # JSON-typed parameter defaults are emitted via json.dumps(...).
            flow_file.write("import json\n")
        flow_file.write(mf_params_import + "\n")
        flow_file.write(f"from {fq_module_name} import {', '.join(orig_step_names)}\n")
        if len(params) > 0:
            flow_file.write(
                f"from {fq_module_name} import {', '.join(params.keys())}\n"
            )

        flow_file.write(f"\n\nclass {flow_class_name}(FlowSpec):\n")
        write_params(flow_file, param_meta, ind)
        flow_file.write("\n")
        write_steps(flow_file, steps, orig_step_names, param_meta, ind)
        flow_file.write("\n")

        flow_file.write('if __name__ == "__main__":\n')
        flow_file.write(f"{ind}{flow_class_name}()")

## `write_params`

In [None]:
# | export


def write_params(flow_file, param_metas, ind):
    """Emit one ``Parameter`` class attribute per supported parameter.

    Scalars use the parameter itself as default, JSON-typed parameters are
    serialised with ``json.dumps`` and typed as ``JSONType``, and
    ``PosixPath`` parameters are converted with ``str``. Parameters matching
    none of these cases produce no output.
    """
    for name, meta in param_metas.items():
        if meta.is_scalar:
            declaration = f"{ind}{name} = Parameter('{name}', default={name})\n"
        elif meta.is_json_type:
            declaration = f"{ind}{name} = Parameter('{name}', default=json.dumps({name}), type=JSONType)\n"
        elif meta.instance_type == PosixPath:
            declaration = f"{ind}{name} = Parameter('{name}', default=str({name}))\n"
        else:
            # Unsupported kind: nothing is written for this parameter.
            continue
        flow_file.write(declaration)

In [None]:
import tempfile

# A single scalar parameter: write_params should emit it as a plain
# Parameter line whose default is the parameter itself.
param_meta_dict = {
    "foo": ParamMeta(
        instance_type=str,
        is_scalar=True,
        is_json_type=False,
        persist_type="pickle",
        has_metaflow_param=True,
        has_sagemaker_param=True,
    )
}
# Round-trip through a real file: write with write_params, then read the
# lines back. An empty indent is used, so the line starts at column 0.
with tempfile.TemporaryDirectory() as temp_dir:
    with open(Path(temp_dir, "flow"), "w") as flow_file:
        write_params(flow_file, param_meta_dict, "")
    with open(Path(temp_dir, "flow"), "r") as written_file:
        file_lines = written_file.readlines()
assert file_lines == ["foo = Parameter('foo', default=foo)\n"]

In [None]:
nb_path = Path(test_dir, "test_data_handling.ipynb")
params = params_as_dict(nb_path)
param_meta = extract_param_meta("sciflow.test.test_data_handling", params)

In [None]:
assert any((p.has_metaflow_param for p in param_meta.values()))
assert any((p.is_json_type for p in param_meta.values()))

## `format_arg`

In [None]:
# | export


def format_arg(arg, param_meta):
    """Return how ``arg`` is referenced inside a generated step method.

    A known parameter without a Metaflow ``Parameter`` is referenced by its
    bare name; anything else is read from the flow instance as ``self.<arg>``.
    """
    is_plain_param = arg in param_meta and not param_meta[arg].has_metaflow_param
    return arg if is_plain_param else "self." + arg

In [None]:
# A parameter that has a Metaflow Parameter is read from the flow instance.
assert "self.foo" == format_arg(
    "foo",
    {
        "foo": ParamMeta(
            instance_type=str,
            is_scalar=True,
            is_json_type=False,
            persist_type="pickle",
            has_metaflow_param=True,
            has_sagemaker_param=True,
        )
    },
)
# A known parameter without a Metaflow Parameter is referenced by its bare name.
assert "foo" == format_arg(
    "foo",
    {
        "foo": ParamMeta(
            instance_type=str,
            is_scalar=True,
            is_json_type=False,
            persist_type="pickle",
            has_metaflow_param=False,
            has_sagemaker_param=True,
        )
    },
)

## `write_steps`

In [None]:
# | export


def write_steps(flow_file, steps, orig_step_names, param_meta, ind):
    """Write one ``@step`` method per entry of ``steps`` to ``flow_file``.

    Steps that have a matching entry in ``orig_step_names`` call the original
    module function of that name; when the step returns a value the call
    result is stored in ``results`` and merged onto the flow via
    ``write_track_capture``. Steps beyond ``orig_step_names`` get a ``pass``
    body. Every step except the last chains to its successor with
    ``self.next``.

    Args:
        flow_file: Writable text stream receiving the generated code.
        steps: Sequence of step descriptions (``name``, ``docstring``,
            ``args``, ``has_return`` and ``return_stmt`` are read).
        orig_step_names: Sequence of the original function names, aligned
            with ``steps`` by index.
        param_meta: Mapping of parameter name to its metadata.
        ind: String used for one level of indentation.

    Raises:
        ValueError: If a step's ``return_stmt`` is also a key of
            ``param_meta``.
    """
    for i, step in enumerate(steps):
        flow_file.write(f"{ind}@step\n")
        flow_file.write(f"{ind}def {step.name}(self):\n")
        if step.docstring:
            flow_file.write(f"{indent_multiline(step.docstring, 2)}\n")

        if i < len(orig_step_names):
            # Strip each name so "a, b" and "a,b" resolve identically against
            # param_meta, and drop empty fragments (e.g. a trailing comma)
            # that would otherwise be rendered as a bare "self.".
            arg_names = (
                [name.strip() for name in step.args.split(",")] if step.args else []
            )
            flow_step_args = ", ".join(
                format_arg(name, param_meta) for name in arg_names if name
            )
            if not step.has_return:
                flow_file.write(f"{ind}{ind}{orig_step_names[i]}({flow_step_args})\n")
            else:
                if step.return_stmt in param_meta:
                    # In-memory streams have no ``name``; fall back to a
                    # placeholder so the intended ValueError is raised.
                    target = os.path.basename(
                        str(getattr(flow_file, "name", "<stream>"))
                    )
                    raise ValueError(
                        f"[{target}] step return variable {step.return_stmt} shadows a parameter name - parameters must be unique"
                    )
                flow_file.write(
                    f"{ind}{ind}results = {orig_step_names[i]}({flow_step_args})\n"
                )
                write_track_capture(flow_file, ind, 2)
        else:
            flow_file.write(f"{ind}{ind}pass\n")
            flow_file.write("\n")
        if i < len(steps) - 1:
            next_step = steps[i + 1].name
            flow_file.write(f"{ind}{ind}self.next(self.{next_step})\n")
        flow_file.write("\n")

## `write_track_capture`

In [None]:
# | export


def write_track_capture(flow_file, ind, num_indents):
    """Write the code that copies a step's ``results`` dict onto the flow.

    The generated loop stores each key of ``results`` in ``self.__dict__``;
    when the key is already present the new value is combined with the
    existing one using ``+``.

    Args:
        flow_file: Writable text stream receiving the generated code.
        ind: String used for one level of indentation.
        num_indents: Number of ``ind`` levels preceding the ``for`` line.
    """
    # Plain string repetition; no need to build a numpy array to repeat text.
    base_ind = ind * num_indents
    flow_file.write(f"{base_ind}for key in results.keys():\n")
    flow_file.write(f"{base_ind}{ind}if key in self.__dict__:\n")
    flow_file.write(
        f"{base_ind}{ind}{ind}self.__dict__[key] = self.__dict__[key] + results[key]\n"
    )
    flow_file.write(f"{base_ind}{ind}else:\n")
    flow_file.write(f"{base_ind}{ind}{ind}self.__dict__[key] = results[key]\n")

In [None]:
from io import StringIO

# Render two returning steps into an in-memory stream so the generated text
# can be compared against an expected listing.
flow_stream = StringIO()
params_meta = {
    "foo": ParamMeta(
        instance_type=str,
        is_scalar=True,
        is_json_type=False,
        persist_type="pickle",
        has_metaflow_param=True,
        has_sagemaker_param=True,
    )
}
# NOTE(review): the return_stmt literals end in an unbalanced "}". write_steps
# only checks them for membership in params_meta, so this does not affect the
# output - confirm whether the extra brace is intentional.
# "bar" is not a key of params_meta, so it is rendered as self.bar.
steps = [
    FuncDetails(
        "first",
        docstring="none",
        args="foo",
        has_return=True,
        return_stmt="results={'bar': 1}}",
        code="",
    ),
    FuncDetails(
        "last",
        docstring="bla",
        args="bar",
        has_return=True,
        return_stmt="results={'bar': 2}}",
        code="bar = [1,2,3]",
    ),
]
write_steps(flow_stream, steps, ["first", "last"], params_meta, "    ")
# Rewind so a subsequent read() returns everything that was written.
flow_stream.seek(0)

0

In [None]:
expected = """
@step
    def first(self):
        \"\"\"none\"\"\"
        results = first(self.foo)
        for key in results.keys():
            if key in self.__dict__:
                self.__dict__[key] = self.__dict__[key] + results[key]
            else:
                self.__dict__[key] = results[key]
        self.next(self.last)

    @step
    def last(self):
        \"\"\"bla\"\"\"
        results = last(self.bar)
        for key in results.keys():
            if key in self.__dict__:
                self.__dict__[key] = self.__dict__[key] + results[key]
            else:
                self.__dict__[key] = results[key]
"""

In [None]:
assert flow_stream.read().strip(" \n") == expected.strip(" \n")

In [None]:
# Expected capture loop at zero base indentation; surrounding spaces and
# newlines are stripped so only the code lines are compared.
expected_lines = """    
for key in results.keys():
    if key in self.__dict__:
        self.__dict__[key] = self.__dict__[key] + results[key]
    else:
        self.__dict__[key] = results[key]
""".strip(
    " \n"
)
# Write through a real temporary file and read the result back.
with tempfile.TemporaryDirectory() as temp_dir:
    with open(Path(temp_dir, "flow"), "w") as flow_file:
        write_track_capture(flow_file, "    ", 0)
    with open(Path(temp_dir, "flow"), "r") as written_file:
        file_lines = written_file.readlines()
assert "".join(file_lines).strip("\n") == expected_lines

# Test Flow Generation

In [None]:
# A notebook's flow is expected under flows/metaflow next to it, with a .py suffix.
assert Path(test_dir, "flows", "metaflow", "test_data_handling.py") == get_flow_path(
    Path(test_dir, "test_data_handling.ipynb")
)

In [None]:
prepare_env()

In [None]:
nb_path = Path(test_dir, "test_multistep.ipynb")
nb = read_nb(nb_path)
module_name = find_default_export(nb["cells"]).replace(".", "/")
test_module = os.path.join(get_config().path("lib_path"), f"{module_name}.py")
steps = extract_steps(test_module)

In [None]:
nb_to_metaflow(nb_path, get_flow_path(nb_path), silent=False)

2023-10-31 15:53:02,729 [MainThread  ] [DEBUG]  Ignoring conversion for nb with no named steps: /home/sagemaker-user/git/sciflow/nbs/test/test_multistep.ipynb


In [None]:
# Start from a clean slate: remove any previously generated flow file.
test_nb_path = Path(test_dir, "test_multistep.ipynb")
test_flow_path = get_flow_path(test_nb_path)
if test_flow_path.exists():
    test_flow_path.unlink()
assert not test_flow_path.exists()

In [None]:
nb_to_metaflow(
    Path(test_dir, "test_multistep.ipynb"),
    get_flow_path(Path(test_dir, "test_multistep.ipynb")),
    silent=False,
)

Converted test_multistep.ipynb to TestMultistepFlow in: test_multistep.py


In [None]:
assert test_flow_path.exists()

AssertionError: 

# Ignore notebooks without Sciflow steps

In [None]:
# Name the step-less notebook once (under nbs_dir) and use that same path in
# the assertion instead of rebuilding it.
no_steps_path = Path(nbs_dir, "packaging.ipynb")
assert not get_flow_path(no_steps_path).exists()

In [None]:
nb_to_metaflow(
    Path(nbs_dir, "packaging.ipynb"),
    get_flow_path(Path(nbs_dir, "packaging.ipynb")),
    silent=False,
)

In [None]:
assert not get_flow_path(Path(nbs_dir, "packaging.ipynb")).exists()

# Multiple Flow Generation

## `generate_flows`

In [None]:
# | export


def generate_flows(config=None, clear_dir=True):
    """Generate a Metaflow flow for every notebook found by ``nbglob``.

    Args:
        config: Project configuration. When None, ``get_config()`` is used to
            locate the flows directory. The value is passed to
            ``get_flow_path`` unchanged.
        clear_dir: When True, delete the files (not sub-directories) already
            present in the metaflow output directory before generating.
    """
    # Honour an explicitly supplied config when locating the output directory
    # so the directory that is cleared matches the one flows are written for.
    cfg = get_config() if config is None else config
    metaflows_dir = Path(cfg.path("flows_path"), "metaflow")
    metaflows_dir.mkdir(parents=True, exist_ok=True)
    if clear_dir:
        # Snapshot the listing first so deletion does not disturb iteration.
        for existing in list(metaflows_dir.iterdir()):
            if not existing.is_dir():
                existing.unlink()
    for nb_path in nbglob():
        nb_to_metaflow(
            nb_path,
            get_flow_path(nb_path, config=config),
            silent=False,
        )

In [None]:
generate_flows()

Converted test_export.ipynb to TestExportFlow in: test_export.py
Converted test_module.ipynb to TestModuleFlow in: test_module.py
No params cell found for: test_multistep_no_params.ipynb
Converted test_multistep_no_params.ipynb to TestMultistepNoParamsFlow in: test_multistep_no_params.py
Converted test_data_handling.ipynb to TestDataHandlingFlow in: test_data_handling.py


# CLI Commands

## `sciflow_metaflow`

In [None]:
# | export


@call_parse
def sciflow_metaflow(log_level: str = "warn"):
    """Configure logging at `log_level`, then generate the project's Metaflow flows."""
    configure_logging(log_level)
    project_config = get_config()
    generate_flows(config=project_config)

In [None]:
rootLogger = logging.getLogger()
rootLogger.handlers = []

In [None]:
sciflow_metaflow(log_level="info")

Converted test_export.ipynb to TestExportFlow in: test_export.py
Converted test_module.ipynb to TestModuleFlow in: test_module.py
No params cell found for: test_multistep_no_params.ipynb
Converted test_multistep_no_params.ipynb to TestMultistepNoParamsFlow in: test_multistep_no_params.py
Converted test_data_handling.ipynb to TestDataHandlingFlow in: test_data_handling.py
