In [1]:
# default_exp to_sagemaker

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# export


import os
from pathlib import Path
from typing import Iterable

from nbdev.export import find_default_export, get_config, read_nb

from sciflow.data_handler import extract_param_meta
from sciflow.params import params_as_dict
from sciflow.parse_module import FuncDetails, extract_steps

In [4]:
from sciflow.to_metaflow import (
    extract_module_only,
    get_flow_path,
    titleize,
)

# Sciflow Notebook to Sagemaker Pipeline

> Converts from a `sciflow` format notebook to a `sagemaker` pipeline. 

Currently Supported features:

* Linear/sequential DAGs
* Simple `Parameters`

In [5]:
nb_path = Path(os.path.join("test", "test_clustering.ipynb"))
nb = read_nb(nb_path)
module_name = find_default_export(nb["cells"]).replace(".", "/")
test_module = os.path.join(get_config().path("lib_path"), f"{module_name}.py")

In [6]:
flow_path = get_flow_path(nb_path, flow_provider="sagemaker")

In [7]:
flow_path

Path('/home/sagemaker-user/git/sciflow/nbs/test/flows/sagemaker/test_clustering.py')

In [8]:
steps = extract_steps(test_module)

In [9]:
# export


def nb_to_sagemaker_pipeline(
    nb_path: Path, flow_path: Path, silent=True, track_experiment=True
):
    nb = read_nb(nb_path)
    lib_name = get_config().get("lib_name")
    module_name = find_default_export(nb["cells"])
    if not module_name:
        return
    module_name = module_name
    path_sep_module_name = module_name.replace(".", "/")
    nb_name = os.path.basename(nb_path)
    exported_module = os.path.join(
        get_config().path("lib_path"), f"{path_sep_module_name}.py"
    )
    steps = extract_steps(exported_module)
    if len(steps) == 0:
        print("Skipping sagemaker conversion - not steps found")
        return
    params = params_as_dict(nb_path)
    if len(params) == 0:
        print(f"No params cell found for: {os.path.basename(nb_path)}")
    pipeline_class_name = f"{titleize(extract_module_only(module_name))}Pipeline"
    write_pipeline_to_files(
        flow_path,
        pipeline_class_name,
        lib_name,
        module_name,
        steps,
        params,
        track_experiment,
    )
    if not silent:
        print(
            f"Converted {nb_name} to {pipeline_class_name} in: {os.path.basename(flow_path)}"
        )

In [10]:
# export

def is_train_step(step):
    return any(step.name.startswith(prefix) for prefix in ('fit', 'train'))

def is_processing_step(step):
    return not is_train_step(step)

In [11]:
training_step = FuncDetails('fit_something', docstring=None, args=None, 
                            has_return=False, return_stmt=None, code='')
proc_step = FuncDetails('blabla', docstring=None, args=None, 
                            has_return=False, return_stmt=None, code='')

In [12]:
assert(not is_processing_step(training_step))
assert(is_train_step(training_step))
assert(is_processing_step(proc_step))
assert(not is_train_step(proc_step))

In [13]:
lib_name = get_config().get("lib_name")
module_name = find_default_export(nb["cells"])
fq_module_name = f"{lib_name}.{module_name}"

In [14]:
#  export


def write_pipeline_to_files(
    flow_path: Path,
    pipeline_class_name: str,
    lib_name: str,
    module_name: str,
    steps: Iterable[FuncDetails],
    params: dict,
    track_experiment: bool,
):
    if not os.path.exists(flow_path.parent):
        os.mkdir(flow_path.parent)
    fq_module_name = f"{lib_name}.{module_name}"
    param_meta = extract_param_meta(fq_module_name, params)
    with open(flow_path, "w") as flow_file:
        flow_file.write("#!/usr/bin/env python\n")
        flow_file.write("# coding=utf-8\n")
        flow_file.write("# SCIFLOW GENERATED FILE - EDIT COMPANION NOTEBOOK\n")

        flow_file.write("import os\n")

        flow_file.write("import sagemaker\n")
        flow_file.write("from sagemaker.session import Session\n")
        flow_file.write("from sagemaker.workflow.pipeline import Pipeline\n")

        has_train_step = any(
            [is_train_step(s) for s in steps]
        )
        has_processing_step = sum(
            [is_processing_step(s) for s in steps]
        ) != len(steps)

        if has_train_step and has_processing_step:
            flow_file.write(
                "from sagemaker.workflow.steps import ProcessingStep, TrainingStep\n"
            )
        if has_processing_step:
            flow_file.write(
                "from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput\n"
            )
            flow_file.write("from sagemaker.workflow.pipeline import Pipeline\n")
        if has_train_step:
            flow_file.write("from sagemaker.inputs import TrainingInput\n")
            flow_file.write("from sagemaker.estimator import Estimator\n")

        has_sm_param = any((p.has_sagemaker_param for p in param_meta.values()))
        if has_sm_param:
            instance_types = [p.instance_type for p in param_meta.values()]
            sm_params_import = "from sagemaker.workflow.parameters import "
            if int in instance_types:
                sm_params_import += "ParameterInteger"
                if float in instance_types or str in instance_types:
                    sm_params_import += ", "
            if float in instance_types:
                sm_params_import += "ParameterFloat"
                if str in instance_types:
                    sm_params_import += ", "
            if str in instance_types:
                sm_params_import += "ParameterString"

            flow_file.write(sm_params_import + "\n")

        flow_file.write("\n")
        flow_file.write(
            f"from {fq_module_name} import {', '.join([s.name for s in steps])}\n"
        )
        if len(params) > 0:
            flow_file.write(
                f"from {fq_module_name} import {', '.join(params.keys())}\n"
            )

        flow_file.write(f"\n\nclass {pipeline_class_name}():\n")
        single_indent = "    "
        write_params(flow_file, param_meta, single_indent)
        flow_file.write("\n")
        write_steps(
            fq_module_name,
            flow_file,
            steps,
            param_meta,
            single_indent,
            track_experiment,
        )
        flow_file.write("\n")

        flow_file.write('if __name__ == "__main__":\n')
        flow_file.write(f"{single_indent}{pipeline_class_name}()")

In [15]:
# export


def write_observers(lib_name, flow_file, module_name, bucket_name, project):
    pass

In [16]:
# export


def write_track_flow(flow_file, track_experiment):
    pass

In [17]:
#  export


def write_params(flow_file, param_meta, single_indent):
    for param in param_meta.keys():
        if param_meta[param].instance_type == int:
            flow_file.write(
                f"{single_indent}{param} = ParameterInteger(name='{param}', default_value={param})\n"
            )
        elif param_meta[param].instance_type == float:
            flow_file.write(
                f"{single_indent}{param} = ParameterFloat(name='{param}', default_value={param})\n"
            )
        elif param_meta[param].instance_type == str:
            flow_file.write(
                f"{single_indent}{param} = ParameterString(name='{param}', default_value={param})\n"
            )

In [18]:
def write_script_processor(flow_file):
    flow_file.write(
        """
        script_processor = ScriptProcessor(
                command=['python3'],
                image_uri="141502667606.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3",
                role=self.role,
                instance_count=1,
                instance_type="ml.m5.xlarge",
                sagemaker_session=self.sagemaker_session,
                env={'AWS_DEFAULT_REGION': self.region},
                base_job_name=f'processing-job/{__file__}'
        )
        """
    )

In [19]:
# export

def group_args(step, param_names, flow_scope):
    if len(step.args) > 0:
        args = [x.strip() for x in step.args.split(",")]
        input_vars = [a for a in args if a in param_names]
        created_vars = [a for a in args if a in flow_scope]
        unscoped_vars = set(args).difference(set(input_vars + created_vars))
        if len(unscoped_vars) > 0:
            raise ValueError(f'Step: {step.name} depends on variable(s), "{unscoped_vars}"  ,which not in the flow scope')
        return input_vars, created_vars

In [22]:
no_arg_step = FuncDetails('no_arg_step', docstring=None, args='', 
                            has_return=False, return_stmt=None, code='')
single_arg_step = FuncDetails('fit_single_arg_step', docstring=None, args="one_param", 
                            has_return=False, return_stmt=None, code='')
multi_arg_step = FuncDetails('multi_arg_step', docstring=None, args="one_param,two_param", 
                            has_return=False, return_stmt=None, code='')

In [33]:
assert group_args(no_arg_step, ['one_param'], ['two_param']) is None
assert group_args(no_arg_step, [], []) is None
raised = False
try:
    group_args(single_arg_step, [], []) is None
except:
    raised = True
assert raised
assert group_args(single_arg_step, ['one_param'], ['two_param']) == (['one_param'], [])
assert group_args(multi_arg_step, ['one_param'], ['two_param']) == (['one_param'], ['two_param'])
raised = False
try:
    assert group_args(multi_arg_step, ['one_param'], [])
except:
    raised = True
assert raised

In [109]:
# export

def format_job_arguments(param_meta):
    job_arg_values = [
        f"str(self.{p}.__int__())" if param_meta[p].instance_type == int else 
        f"str(self.{p}.__float__())" if param_meta[p].instance_type == float else
        f"self.{p}.__str__()" if param_meta[p].instance_type == str else
        f"str(self.{p})" for p in param_meta.keys()
    ]
    stitched_args = list(zip([f"--{p}" for p in param_meta.keys()], job_arg_values))
    flattened = [item for sublist in stitched_args for item in sublist]
    return flattened

In [111]:
from sciflow.data_handler import ParamMeta
from dataclasses import dataclass
@dataclass
class MockParamMeta:
    instance_type: type

assert (
    ['--str_param', 'self.str_param.__str__()'] ==
    format_job_arguments({"str_param": MockParamMeta(str)})
)
assert (
    ['--int_param', "str(self.int_param.__int__())",
    '--float_param', "str(self.float_param.__float__())"] ==
    format_job_arguments(
        {
            "int_param": MockParamMeta(int),
            "float_param": MockParamMeta(float) 
        }
    )
)
assert (
    ['--int_param', "str(self.int_param.__int__())"] ==
    format_job_arguments({"int_param": MockParamMeta(int)})
)

In [128]:
# export


def format_arg(arg, param_meta):
    if arg in param_meta and not param_meta[arg].has_metaflow_param:
        result = arg
    else:
        result = "self." + arg
    return result


def write_steps(
    fq_module_name, flow_file, steps, param_meta, single_indent, track_experiment
):
    param_names = list(param_meta.keys())
    flow_scope = []
    outputs = {}
    
    for i, step in enumerate(steps):
        return_vars = get_return_var_names(step)
        args_by_class = group_args(step, param_names, flow_scope)
        if args_by_class is not None:
            input_vars, created_vars = args_by_class
            
        flow_file.write(f"{single_indent}def {step.name}(self):\n")
        if step.docstring:
            flow_file.write(f"{indent_multiline(step.docstring, 2)}\n")
        # Processing step
        if is_processing_step(step):
            write_script_processor(flow_file)

            flow_file.write("\n")
            flow_file.write(
                f"{single_indent}{single_indent}{step.name}_step = ProcessingStep(\n"
            )
            flow_file.write(
                f"{single_indent}{single_indent}{single_indent}name={step.name}\n"
            )
            flow_file.write(
                f"{single_indent}{single_indent}{single_indent}processor = script_processor\n"
            )
            print(f"job_arguments={format_job_arguments(param_meta)}")
            flow_file.write(
                f"{single_indent}{single_indent}{single_indent}code = {fq_module_name}_{step.name}.py\n"
            )
            flow_file.write(f"{single_indent}{single_indent})\n")
        else:
            # Training step
            pass
        flow_file.write(f"self.{step.name}_step = {step.name}_step\n")
        flow_file.write("\n")

In [129]:
# export


def create_sm_dag(steps, param_meta):
    param_names = list(param_meta.keys())
    flow_scope = []
    outputs = {}
    for step in steps:
        return_vars = get_return_var_names(step)
        args_by_class = classify_args(step, param_names, flow_scope)
        if args_by_class is not None:
            input_vars, created_vars = args_by_class
        if is_processing_step(step):
            print(f"Step: {step.name}")
            #if len(step.args) > 0:
            #    print(f"Step Args: {step.args}")
            if args_by_class is not None:
                if len(input_vars) > 0:
                    print(f"Job arguments: {input_vars}")
                #print(f"Args that are in flow scope: {created_vars}")
                if len(created_vars) > 0:
                    print(f"ProcessingInputs: {[outputs[cv] for cv in created_vars]}")
            proc_outs = {(v, f'step_{step.name}.properties.ProcessingOutputConfig.Outputs["{v}"].S3Output.S3Uri') 
                               for v in return_vars}
            outputs.update(proc_outs)
            if len(proc_outs) > 0:
                print(f"ProcessingOutputs: {proc_outs}")
        elif is_train_step(step):
            print(f"Step: {step.name}")
            #if len(step.args) > 0:
            #    print(f"Step Args: {step.args}")
            if args_by_class is not None:
                if len(input_vars) > 0:
                    print(f"Hyperparameters: {input_vars}")
                #print(f"Args that are params: {input_vars}")
                #print(f"Args that are in flow scope: {created_vars}")
                if len(created_vars) > 0:
                    print(f"TrainingInputs: {[outputs[cv] for cv in created_vars]}")
            train_outs = {(v, f'step_{step.name}.properties.ModelArtifacts.S3ModelArtifacts') 
                               for v in return_vars}
            outputs.update(train_outs)
        flow_scope.extend(return_vars)
        print("\n")
        
    # ProcessingInput comes from params or from preceding ste

In [130]:
# export


def write_track_capture(flow_file):
    flow_file.write(
        f"""
        for key in results.keys():
            if key in self.__dict__:
                self.__dict__[key] = self.__dict__[key] + results[key]
            else:
                self.__dict__[key] = results[key]

"""
    )

In [131]:
# export


def get_return_var_names(step):
    results_index = step.code.find("results =")
    if results_index == -1:
        return []
    return [
        l.split(":")[1].strip(", \}")
        for l in step.code[results_index:].split("\n")
        if l.strip().find(":") > -1
    ]

In [132]:
params = params_as_dict(nb_path)
param_meta = extract_param_meta(fq_module_name, params)

In [133]:
param_names = list(param_meta.keys())

In [134]:
create_sm_dag(steps, param_meta)

NameError: name 'classify_args' is not defined

In [135]:
nb_to_sagemaker_pipeline(nb_path, flow_path, silent=False, track_experiment=False)

job_arguments=['--traffic_percent', 'str(self.traffic_percent.__int__())', '--workers', 'str(self.workers.__int__())', '--model_level', 'self.model_level.__str__()', '--min_date', 'self.min_date.__str__()']
job_arguments=['--traffic_percent', 'str(self.traffic_percent.__int__())', '--workers', 'str(self.workers.__int__())', '--model_level', 'self.model_level.__str__()', '--min_date', 'self.min_date.__str__()']


ValueError: Step: fit depends on variable(s), "{'documents'}"  ,which not in the flow scope