In [1]:
# default_exp to_sagemaker

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# export


import os
from pathlib import Path
from typing import Iterable

from nbdev.export import find_default_export, get_config, read_nb

from sciflow.data_handler import extract_param_meta
from sciflow.params import params_as_dict
from sciflow.parse_module import FuncDetails, extract_steps

In [4]:
from sciflow.to_metaflow import (
    extract_module_only,
    get_flow_path,
    titleize,
)

# Sciflow Notebook to Sagemaker Pipeline

> Converts a `sciflow`-format notebook into a `sagemaker` pipeline.

Currently Supported features:

* Linear/sequential DAGs
* Simple `Parameters`

In [5]:
# Fixture for the cells below: read the test notebook, turn its default-export
# module name into a relative path, and locate the matching .py under lib_path.
nb_path = Path(os.path.join("test", "test_clustering.ipynb"))
nb = read_nb(nb_path)
module_name = find_default_export(nb["cells"]).replace(".", "/")
test_module = os.path.join(get_config().path("lib_path"), f"{module_name}.py")

In [6]:
flow_path = get_flow_path(nb_path, flow_provider="sagemaker")

In [7]:
flow_path

Path('/home/sagemaker-user/git/sciflow/nbs/test/flows/sagemaker/test_clustering.py')

In [8]:
steps = extract_steps(test_module)

In [9]:
# export


def nb_to_sagemaker_pipeline(
    nb_path: Path, flow_path: Path, silent=True, track_experiment=True
):
    """Convert a sciflow notebook into a SageMaker pipeline file.

    Args:
        nb_path: Notebook to convert.
        flow_path: Destination of the generated pipeline module.
        silent: When False, print a confirmation line after writing the file.
        track_experiment: Forwarded unchanged to `write_pipeline_to_files`.

    Returns:
        None. Nothing is written when the notebook has no default export or
        when no steps are found in the exported module.
    """
    nb = read_nb(nb_path)
    lib_name = get_config().get("lib_name")
    module_name = find_default_export(nb["cells"])
    if not module_name:
        return
    nb_name = os.path.basename(nb_path)
    # The exported module lives under lib_path, with dots mapped to directories.
    path_sep_module_name = module_name.replace(".", "/")
    exported_module = os.path.join(
        get_config().path("lib_path"), f"{path_sep_module_name}.py"
    )
    steps = extract_steps(exported_module)
    if len(steps) == 0:
        print("Skipping sagemaker conversion - no steps found")
        return
    params = params_as_dict(nb_path)
    if len(params) == 0:
        # Not fatal: a flow without parameters is still generated.
        print(f"No params cell found for: {nb_name}")
    pipeline_class_name = f"{titleize(extract_module_only(module_name))}Pipeline"
    write_pipeline_to_files(
        flow_path,
        pipeline_class_name,
        lib_name,
        module_name,
        steps,
        params,
        track_experiment,
    )
    if not silent:
        print(
            f"Converted {nb_name} to {pipeline_class_name} in: {os.path.basename(flow_path)}"
        )

In [57]:
# export

def is_train_step(step):
    """Return True when the step's name begins with "fit" or "train"."""
    # str.startswith accepts a tuple of candidate prefixes.
    return step.name.startswith(("fit", "train"))

def is_processing_step(step):
    """Return True for every step that `is_train_step` does not accept."""
    if is_train_step(step):
        return False
    return True

In [64]:
# Minimal fixtures for the step classifiers, which only read `name`.
training_step = FuncDetails(
    "fit_something",
    docstring=None,
    args=None,
    has_return=False,
    return_stmt=None,
    code="",
)
proc_step = FuncDetails(
    "blabla",
    docstring=None,
    args=None,
    has_return=False,
    return_stmt=None,
    code="",
)

In [65]:
# A "fit_*" step is a train step only; any other name is a processing step only.
assert is_train_step(training_step)
assert not is_processing_step(training_step)
assert is_processing_step(proc_step)
assert not is_train_step(proc_step)

In [72]:
# Build "<lib_name>.<default export>", the same fully qualified module name
# that write_pipeline_to_files derives internally.
lib_name = get_config().get("lib_name")
module_name = find_default_export(nb["cells"])
fq_module_name = f"{lib_name}.{module_name}"

In [66]:
#  export


def write_pipeline_to_files(
    flow_path: Path,
    pipeline_class_name: str,
    lib_name: str,
    module_name: str,
    steps: Iterable[FuncDetails],
    params: dict,
    track_experiment: bool,
):
    """Write the generated SageMaker pipeline module to `flow_path`.

    The file consists of a header, the sagemaker imports required by the kinds
    of step present, imports of the step functions and parameters from
    `<lib_name>.<module_name>`, the pipeline class (parameters followed by
    steps) and a `__main__` guard that instantiates the class.

    Args:
        flow_path: Destination file; missing parent directories are created.
        pipeline_class_name: Name of the generated class.
        lib_name: Package name, first part of the fully qualified module name.
        module_name: Dotted module name inside `lib_name`.
        steps: Step descriptions; a step whose name starts with "fit"/"train"
            is a train step, anything else a processing step.
        params: Notebook parameters by name.
        track_experiment: Forwarded unchanged to `write_steps`.
    """
    # `steps` is only promised to be an Iterable but is walked several times.
    steps = list(steps)
    # Unlike os.mkdir, this also creates missing intermediate directories and
    # does not fail if the directory appears between the check and the call.
    flow_path.parent.mkdir(parents=True, exist_ok=True)
    fq_module_name = f"{lib_name}.{module_name}"
    param_meta = extract_param_meta(fq_module_name, params)

    has_train_step = any(is_train_step(s) for s in steps)
    has_processing_step = any(is_processing_step(s) for s in steps)

    # The generated header declares utf-8, so write the file as utf-8.
    with open(flow_path, "w", encoding="utf-8") as flow_file:
        flow_file.write("#!/usr/bin/env python\n")
        flow_file.write("# coding=utf-8\n")
        flow_file.write("# SCIFLOW GENERATED FILE - EDIT COMPANION NOTEBOOK\n")

        flow_file.write("import os\n")

        flow_file.write("import sagemaker\n")
        flow_file.write("from sagemaker.session import Session\n")
        flow_file.write("from sagemaker.workflow.pipeline import Pipeline\n")

        # Import each step class whenever a step of that kind exists, so a
        # processing-only or train-only flow still gets the class it uses.
        step_classes = []
        if has_processing_step:
            step_classes.append("ProcessingStep")
        if has_train_step:
            step_classes.append("TrainingStep")
        if step_classes:
            flow_file.write(
                f"from sagemaker.workflow.steps import {', '.join(step_classes)}\n"
            )
        if has_processing_step:
            flow_file.write(
                "from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput\n"
            )
        if has_train_step:
            flow_file.write("from sagemaker.inputs import TrainingInput\n")
            flow_file.write("from sagemaker.estimator import Estimator\n")

        # NOTE(review): write_params emits Parameter* for every int/float/str
        # parameter, while this import is gated on has_sagemaker_param -
        # confirm the two conditions are meant to differ.
        if any(p.has_sagemaker_param for p in param_meta.values()):
            instance_types = [p.instance_type for p in param_meta.values()]
            parameter_classes = [
                class_name
                for py_type, class_name in (
                    (int, "ParameterInteger"),
                    (float, "ParameterFloat"),
                    (str, "ParameterString"),
                )
                if py_type in instance_types
            ]
            # Never emit a dangling "import " with nothing after it.
            if parameter_classes:
                flow_file.write(
                    "from sagemaker.workflow.parameters import "
                    + ", ".join(parameter_classes)
                    + "\n"
                )

        flow_file.write("\n")
        flow_file.write(
            f"from {fq_module_name} import {', '.join([s.name for s in steps])}\n"
        )
        if len(params) > 0:
            flow_file.write(
                f"from {fq_module_name} import {', '.join(params.keys())}\n"
            )

        flow_file.write(f"\n\nclass {pipeline_class_name}():\n")
        single_indent = "    "
        write_params(flow_file, param_meta, single_indent)
        flow_file.write("\n")
        write_steps(
            fq_module_name,
            flow_file,
            steps,
            param_meta,
            single_indent,
            track_experiment,
        )
        flow_file.write("\n")

        flow_file.write('if __name__ == "__main__":\n')
        # End the generated file with a newline.
        flow_file.write(f"{single_indent}{pipeline_class_name}()\n")

In [11]:
# export


def write_observers(lib_name, flow_file, module_name, bucket_name, project):
    """Placeholder: currently writes nothing and returns None."""

In [12]:
# export


def write_track_flow(flow_file, track_experiment):
    """Placeholder: currently writes nothing and returns None."""

In [13]:
#  export


def write_params(flow_file, param_meta, single_indent):
    """Write one SageMaker parameter declaration per int/float/str parameter.

    Each line has the form
    `<indent><name> = Parameter<Kind>(name='<name>', default_value=<name>)`.
    Parameters whose `instance_type` is none of int, float or str are skipped.
    """
    parameter_classes = (
        (int, "ParameterInteger"),
        (float, "ParameterFloat"),
        (str, "ParameterString"),
    )
    for name, meta in param_meta.items():
        class_name = next(
            (cls for py_type, cls in parameter_classes if meta.instance_type == py_type),
            None,
        )
        if class_name is None:
            continue
        flow_file.write(
            f"{single_indent}{name} = {class_name}(name='{name}', default_value={name})\n"
        )

In [14]:
# export


def write_script_processor(
    flow_file,
    image_uri="141502667606.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3",
    instance_type="ml.m5.xlarge",
    instance_count=1,
):
    """Write the `script_processor = ScriptProcessor(...)` statement of a step.

    The statement is emitted at method-body indentation (8 spaces) and refers
    to `self.role`, `self.sagemaker_session` and `self.region` in the
    generated class.

    Args:
        flow_file: Writable text file receiving the generated code.
        image_uri: Container image written into the generated call. The
            default is the previously hard-coded eu-west-1 scikit-learn image.
        instance_type: Instance type written into the generated call.
        instance_count: Instance count written into the generated call.
    """
    # Doubled braces are literal braces in the generated source.
    flow_file.write(
        f"""
        script_processor = ScriptProcessor(
                command=['python3'],
                image_uri="{image_uri}",
                role=self.role,
                instance_count={instance_count},
                instance_type="{instance_type}",
                sagemaker_session=self.sagemaker_session,
                env={{'AWS_DEFAULT_REGION': self.region}},
                base_job_name=f'processing-job/{{__file__}}'
        )
        """
    )

In [15]:
# export


def format_arg(arg, param_meta):
    """Return `arg` bare or prefixed with "self.".

    The bare name is used only when `arg` is a key of `param_meta` whose
    entry has a falsy `has_metaflow_param`; every other argument is returned
    as "self.<arg>".
    """
    # NOTE(review): this reads has_metaflow_param although the rest of this
    # module gates on has_sagemaker_param - confirm which flag is intended.
    is_plain_param = arg in param_meta and not param_meta[arg].has_metaflow_param
    return arg if is_plain_param else "self." + arg


def write_steps(
    fq_module_name, flow_file, steps, param_meta, single_indent, track_experiment
):
    """Write one method per step into the generated pipeline class.

    Processing steps get a `ScriptProcessor` plus a `ProcessingStep(...)`
    call; train steps are not implemented yet and get a `pass` body so the
    generated module stays valid Python.

    Args:
        fq_module_name: Fully qualified module name, used to build the value
            written for the `code=` argument.
        flow_file: Writable text file receiving the generated code.
        steps: Step descriptions providing `name` and `docstring`.
        param_meta: Currently unused by this function.
        single_indent: One indentation level of the generated code.
        track_experiment: Currently unused by this function.
    """
    body_indent = single_indent * 2
    kwarg_indent = single_indent * 3
    for step in steps:
        flow_file.write(f"{single_indent}def {step.name}(self):\n")
        if step.docstring:
            # This name was never brought into scope by the file's imports.
            # Import it lazily so steps without a docstring are unaffected.
            # Assumes sciflow.to_metaflow provides indent_multiline - TODO confirm.
            from sciflow.to_metaflow import indent_multiline

            flow_file.write(f"{indent_multiline(step.docstring, 2)}\n")
        if is_processing_step(step):
            write_script_processor(flow_file)

            flow_file.write("\n")
            flow_file.write(f"{body_indent}{step.name}_step = ProcessingStep(\n")
            # Keyword arguments need separating commas, and name/code are
            # emitted as string literals rather than bare identifiers.
            flow_file.write(f'{kwarg_indent}name="{step.name}",\n')
            flow_file.write(f"{kwarg_indent}processor=script_processor,\n")
            flow_file.write(
                f'{kwarg_indent}code="{fq_module_name}_{step.name}.py",\n'
            )
            flow_file.write(f"{body_indent})\n")
        else:
            # Training step: not generated yet; keep the method body non-empty.
            flow_file.write(f"{body_indent}pass\n")
        flow_file.write("\n")

In [16]:
# export


def write_track_capture(flow_file):
    """Write the loop that merges `results` into the generated flow's `__dict__`.

    In the emitted code, existing keys are combined with `+` and new keys are
    assigned directly.
    """
    capture_lines = (
        "",
        "        for key in results.keys():",
        "            if key in self.__dict__:",
        "                self.__dict__[key] = self.__dict__[key] + results[key]",
        "            else:",
        "                self.__dict__[key] = results[key]",
        "",
        "",
    )
    flow_file.write("\n".join(capture_lines))

In [17]:
[(s.name, s.args) for s in steps]

[('something', ''),
 ('preprocess', 'model_level,min_date,traffic_percent'),
 ('fit', 'documents,workers'),
 ('evaluate', 'model')]

In [18]:
steps[-1].code.find("results = ")

950

In [19]:
# export


def get_return_var_names(step):
    """Return the value names of the `results = {...}` literal in `step.code`.

    The text is scanned from the first "results =" onwards. For every line
    containing ":" the text between the first and second colon is taken and
    stripped of spaces, commas, backslashes and closing braces. Scanning stops
    once the braces opened by the literal are balanced again, so later lines
    that merely contain a colon (`if x:`, slices, ...) are not reported.

    Returns:
        A list of names in source order, or [] when "results =" is absent.
    """
    results_index = step.code.find("results =")
    if results_index == -1:
        return []
    var_names = []
    depth = 0
    literal_opened = False
    for line in step.code[results_index:].split("\n"):
        if ":" in line:
            # Same character set as the former ", \}" literal, without the
            # invalid escape sequence.
            var_names.append(line.split(":")[1].strip(", \\}"))
        depth += line.count("{") - line.count("}")
        literal_opened = literal_opened or "{" in line
        if literal_opened and depth <= 0:
            break
    return var_names

In [20]:
print(steps[0].code)



def something():
    print("The first step")




In [21]:
print(steps[1].code)



def preprocess(model_level=None, min_date=None, traffic_percent=100):
    data = get_utterances(model_level, min_date, traffic_percent)
    documents = data.tolist()
    results = {"documents": documents}
    return results




In [22]:
print(steps[2].code)



def fit(documents, workers=workers):
    model = Topics(documents, workers=workers)
    results = {"model": model}
    return results




In [23]:
[get_return_var_names(s) for s in steps]

[[], ['documents'], ['model'], ['word_summaries', 'artifacts', 'metrics']]

In [24]:
import json

In [25]:
json.dumps(
    {"word_summaries": "word_summaries", "artifacts": "artifacts", "metrics": "metrics"}
)

'{"word_summaries": "word_summaries", "artifacts": "artifacts", "metrics": "metrics"}'

In [26]:
json.loads(
    '{"word_summaries": "word_summaries","artifacts": "artifacts","metrics": "metrics"}'
)

{'word_summaries': 'word_summaries',
 'artifacts': 'artifacts',
 'metrics': 'metrics'}

In [37]:
steps

['FuncDetails(name=something,args=,has_return=False):\ndef something():\n    print("The first step")',
 ('FuncDetails(name=preprocess,args=model_level,min_date,traffic_percent,has_return=True):\n'
  'def preprocess(model_level=None, min_date=None, traffic_percent=100):\n'
  '    data = get_utterances(model_level, min_date, traffic_percent)\n'
  '    documents = data.tolist()\n'
  '    results = {"documents": documents}\n'
  '    return results'),
 ('FuncDetails(name=fit,args=documents,workers,has_return=True):\n'
  'def fit(documents, workers=workers):\n'
  '    model = Topics(documents, workers=workers)\n'
  '    results = {"model": model}\n'
  '    return results'),
 ('FuncDetails(name=evaluate,args=model,has_return=True):\n'
  'def evaluate(model):\n'
  '    topic_words, word_scores, topic_nums = model.get_topics(model.get_num_topics())\n'
  '\n'
  '    topic_contains_non_empty_words = all([len(tw) > 0 for tw in topic_words])\n'
  '    word_scores_in_range = word_scores.min() >= 0.0

In [172]:
# export


def classify_args(step, param_names, flow_scope):
    """Split a step's arguments into parameters and flow-scope variables.

    Args:
        step: Step description; `step.args` is a comma-separated string of
            argument names (may be empty or None).
        param_names: Names of the notebook parameters.
        flow_scope: Names produced by the preceding steps.

    Returns:
        `(input_vars, created_vars)`: the arguments found in `param_names`
        and those found in `flow_scope`, each in argument order. Returns None
        when the step takes no arguments.

    Raises:
        ValueError: If an argument is neither a parameter nor in flow scope.
    """
    # Covers both "" and None (FuncDetails can be built with args=None).
    if not step.args:
        return None
    args = [x.strip() for x in step.args.split(",")]
    input_vars = [a for a in args if a in param_names]
    created_vars = [a for a in args if a in flow_scope]
    unscoped_vars = set(args).difference(input_vars, created_vars)
    if unscoped_vars:
        # Sorted so the message does not depend on set iteration order.
        raise ValueError(
            f"Step: {step.name} depends on variable(s) {sorted(unscoped_vars)}, "
            "which are not in the flow scope"
        )
    return input_vars, created_vars

In [173]:
# Build the inputs in this cell: `param_names` is only assigned further down
# the notebook and `flow_scope` is never defined at top level, so the original
# call failed on a fresh kernel (Restart & Run All).
demo_param_names = list(
    extract_param_meta(fq_module_name, params_as_dict(nb_path)).keys()
)
classify_args(steps[1], demo_param_names, [])

(['model_level', 'min_date', 'traffic_percent'], [])

In [186]:
# export


def get_sm_step_chain(steps, param_meta):
    """Print, for each step, how its arguments and results map to SageMaker I/O.

    Walks `steps` in order, tracking the names returned by earlier steps as
    the flow scope. For each step it prints the name, the raw argument string,
    which arguments are parameters or flow-scope variables, the resulting
    Processing/Training inputs and, for processing steps, the outputs.
    Returns None.
    """
    param_names = list(param_meta.keys())
    flow_scope = []
    for step in steps:
        return_vars = get_return_var_names(step)
        args_by_class = classify_args(step, param_names, flow_scope)
        processing = is_processing_step(step)

        print(f"Step: {step.name}")
        if len(step.args) > 0:
            print(f"Step Args: {step.args}")
        if args_by_class is not None:
            input_vars, created_vars = args_by_class
            print(f"Args that are params: {input_vars}")
            print(f"Args that are in flow scope: {created_vars}")
            if processing:
                print(f"ProcessingInputs: {created_vars}")
            else:
                print(f"TrainingInputs: {created_vars}")
        if processing:
            print(f"ProcessingOutputs: {return_vars}")
        print("\n")
        # Results of this step become available to every later step.
        flow_scope.extend(return_vars)

    # Design notes:
    # - A ProcessingInput comes from params or from preceding steps; putting
    #   it in scope needs the full URI.
    # - For a processing step: determine its processing inputs and outputs,
    #   and pass params as job_arguments if needed.
    # - Args are either ProcessingInputs, which can be returned from another
    #   step function, or loaded from a training step.
    # - Open question: walk through the steps and, for each one, look at its
    #   args, return vars and params, matching them by name.

In [187]:
# Same two calls the converter makes: read the notebook's params, then derive
# the per-parameter metadata for the fully qualified module.
params = params_as_dict(nb_path)
param_meta = extract_param_meta(fq_module_name, params)

In [188]:
param_names = list(param_meta.keys())

In [189]:
get_sm_step_chain(steps, param_meta)

Step: something
ProcessingOutputs: []


Step: preprocess
Step Args: model_level,min_date,traffic_percent
Args that are params: ['model_level', 'min_date', 'traffic_percent']
Args that are in flow scope: []
ProcessingInputs: []
ProcessingOutputs: ['documents']


Step: fit
Step Args: documents,workers
Args that are params: ['workers']
Args that are in flow scope: ['documents']
TrainingInputs: ['documents']


Step: evaluate
Step Args: model
Args that are params: []
Args that are in flow scope: ['model']
ProcessingInputs: ['model']
ProcessingOutputs: ['word_summaries', 'artifacts', 'metrics']




In [165]:
# [
#     ('something', ''),
#     ('preprocess', 'model_level,min_date,traffic_percent',
#      {
#          'processing_outputs': ['documents'],
#          'job_arguments': ['model_level,min_date,traffic_percent']
#      }
#     ),
#     ('fit', 'documents,workers',
#         {
#           'training_inputs': ['documents']
#         }
#     ),
#     ('evaluate', 'model',
#      {
#          'processing_inputs': ['fit_model'],
#          'processing_outputs': ['word_summaries']
#      }
#     )
# ]

In [35]:
nb_to_sagemaker_pipeline(nb_path, flow_path, silent=False, track_experiment=False)

Converted test_clustering.ipynb to TestClusteringPipeline in: test_clustering.py
