In [None]:
# | include: false
# | default_exp params

# Imports

In [None]:
# | export

import os
import sys
from dataclasses import dataclass
from importlib import import_module
from io import StringIO
from pathlib import Path
from typing import Any, Dict, Iterable, List

import nbformat
import numpy as np
import pandas as pd
from execnb.nbio import read_nb
from nbdev.config import get_config
from nbdev.doclinks import nbglob
from nbformat.notebooknode import NotebookNode

In [None]:
%load_ext autoreload
%autoreload 2

# Basic Papermill Params Utilities

## `find_params_cell`

In [None]:
# | export


def find_params_cell(nb: NotebookNode):
    """Return every cell of `nb` that carries the Papermill `parameters` tag.

    A cell matches when the string "parameters" is one of the entries in its
    `metadata["tags"]`. Cells that carry additional tags or other metadata keys
    are still matched; comparing the whole metadata dict for equality would
    silently skip them.

    Args:
        nb: Notebook mapping exposing a "cells" sequence of dict-like cells.

    Returns:
        A list (possibly empty) of the matching cells, in notebook order.
    """
    params_cell = []
    for cell in nb["cells"]:
        # Metadata or its tag list may be absent or None on hand-built cells.
        tags = (cell.get("metadata") or {}).get("tags") or ()
        if "parameters" in tags:
            params_cell.append(cell)
    return params_cell

In [None]:
# Fixture notebook used by the checks in this notebook; the path is resolved
# against the current working directory.
test_nb = os.path.join(Path(".").resolve(), "test", "test_export.ipynb")

In [None]:
# The fixture notebook is expected to contain exactly one parameters cell,
# while index.ipynb is expected to contain none.
assert len(find_params_cell(read_nb(Path(test_nb)))) == 1
assert len(find_params_cell(read_nb(Path("index.ipynb")))) == 0

In [None]:
# | export

# Template for an empty code cell tagged "parameters". `add_missing_params_cell`
# converts it with `nbformat.from_dict` and inserts it as the first cell of a
# notebook that has no parameters cell yet.
DEFAULT_PARAMS_CELL = {
    "cell_type": "code",
    "execution_count": None,
    "metadata": {"tags": ["parameters"]},
    "outputs": [],
    "source": "# parameters\n",
}

## `add_missing_params_cell`

In [None]:
# | export


def add_missing_params_cell(nb_path: Path, persist: bool = True):
    """Prepend a default parameters cell to a notebook that lacks one.

    Args:
        nb_path: Location of the notebook to read (and to write back to).
        persist: When true, the modified notebook is written to `nb_path`.

    Returns:
        The notebook with the new cell inserted at index 0, or `None` when the
        notebook already has a parameters cell (a message is printed instead).
    """
    notebook = read_nb(nb_path)
    if find_params_cell(notebook):
        print(f"Skipping {nb_path} already has parameters cell")
        return None

    default_cell = nbformat.from_dict(DEFAULT_PARAMS_CELL)
    notebook["cells"].insert(0, default_cell)
    if persist:
        nbformat.write(notebook, nb_path)
    return notebook

In [None]:
# Two fixture notebooks: one that already has a parameters cell, one without.
with_params = os.path.join(Path(".").resolve(), "test", "test_multistep.ipynb")
without_params = os.path.join(
    Path(".").resolve(), "test", "test_multistep_no_params.ipynb"
)

# persist=False throughout so the fixture files on disk are not rewritten.
add_missing_params_cell(with_params, False)
assert len(find_params_cell(read_nb(without_params))) == 0
parameterised_nb = add_missing_params_cell(without_params, False)
assert len(find_params_cell(parameterised_nb)) == 1

Skipping /home/sagemaker-user/git/sciflow/nbs/test/test_multistep.ipynb already has parameters cell


# Parameter Extraction

## `extract_params`

In [None]:
# | export


def extract_params(nb: NotebookNode):
    """Return the source of the first parameters cell in `nb`.

    Returns:
        The cell's "source" value, or `None` when `nb` has no parameters cell.
    """
    matches = find_params_cell(nb)
    if len(matches) == 0:
        return None
    first_match = matches[0]
    return first_match["source"]

In [None]:
# The parameters cell of the fixture starts with an nbdev export directive and
# mentions each of the four expected parameter names.
params_code = extract_params(read_nb(Path(test_nb)))
assert params_code.replace(" ", "").startswith("#|export")
assert "some_param" in params_code
assert "some_params" in params_code
assert "input_path" in params_code
assert "model_path" in params_code

## `extract_params_to_file`

In [None]:
# | export


def extract_params_to_file(nb_path: Path, params_file_path: Path):
    """Write the parameters-cell source of a notebook to a file.

    Args:
        nb_path: Notebook whose parameters cell is extracted.
        params_file_path: Destination file; it is created or overwritten.

    When the notebook has no parameters cell an empty file is written, so the
    destination always exists after the call.
    """
    # Read the notebook that was passed in, not a module-level fixture path.
    params_code = extract_params(read_nb(Path(nb_path)))
    if params_code is None:
        params_code = ""
    elif not isinstance(params_code, str):
        # Tolerate a list-of-lines "source" as well as a single string.
        params_code = "".join(params_code)
    with open(params_file_path, "w") as params_file:
        params_file.write(params_code)

In [None]:
# Writes the extracted parameters into <lib_path>/test/test_export_params.py;
# a later cell reads this file back with extract_params_as_dict.
extract_params_to_file(
    test_nb,
    os.path.join(get_config().path("lib_path"), "test", "test_export_params.py"),
)

## `list_mod_files`

In [None]:
# | export


def list_mod_files(files):
    """Map notebooks to the relative path of their default export module.

    Args:
        files: Iterable of notebook paths.

    Returns:
        A list with one entry per notebook that declares a default export: the
        dotted module name with each "." replaced by the OS path separator.
        Notebooks without a default export contribute no entry.
    """
    # NOTE(review): `find_default_export` is not imported in the visible import
    # cell of this notebook; confirm where it is expected to come from.
    module_paths = []
    for nb_file in files:
        notebook = read_nb(Path(nb_file))
        default_export = find_default_export(notebook["cells"])
        if default_export is None:
            continue
        module_paths.append(default_export.replace(".", os.path.sep))
    return module_paths

## `extract_as_files`

In [None]:
# | export


def extract_as_files(suffix="_params.py"):
    """Extract the parameters cell of every project notebook into a file.

    For each notebook returned by `nbglob(recursive=True)` that declares a
    default export module, the parameters are written to
    `<lib_path>/<module path><suffix>`.

    Args:
        suffix: Text appended to the module path to form the output file name.

    Notebooks without a default export are skipped. Each notebook is resolved
    to its own output path individually; zipping the full notebook list with
    the filtered module list would pair notebooks with the wrong files as soon
    as one notebook has no default export.
    """
    lib_path = get_config().path("lib_path")
    for nb_path in nbglob(recursive=True):
        module_paths = list_mod_files([nb_path])
        if not module_paths:
            continue
        params_file = Path(os.path.join(lib_path, module_paths[0] + suffix))
        extract_params_to_file(nb_path, params_file)

## `_lines_to_dict`

In [None]:
# | exporti


def _lines_to_dict(lines: Iterable[str]):
    result = {}
    for line in lines:
        if line.startswith("#") or not "=" in line:
            continue
        (key, val) = line.split("=")
        result[key.strip()] = val.strip('\n "')
    return result

NameError: name 'Iterable' is not defined

## `extract_params_as_dict`

In [None]:
# | export


def extract_params_as_dict(params_file_path: Path):
    """Read a parameters file and parse it into a name -> value-text dict.

    Args:
        params_file_path: File holding `name = value` lines.

    Returns:
        The dict produced by `_lines_to_dict` for the file's lines.
    """
    with open(params_file_path, "r") as handle:
        param_lines = handle.readlines()
    return _lines_to_dict(param_lines)

In [None]:
# Reads back the parameters file written by the extract_params_to_file cell.
params_dict = extract_params_as_dict(
    os.path.join(get_config().path("lib_path"), "test", "test_export_params.py")
)
# NOTE(review): `tup` is not used by any later visible cell; confirm it is needed.
tup = tuple(params_dict.keys())

## `params_as_dict`

In [None]:
# | export


def params_as_dict(nb_path: Path):
    """Parse the parameters cell of a notebook into a name -> value-text dict.

    Args:
        nb_path: Notebook to read.

    Returns:
        The dict produced by `_lines_to_dict` for the parameters cell source.
    """
    notebook = read_nb(nb_path)
    code_buffer = StringIO(extract_params(notebook))
    return _lines_to_dict(code_buffer.readlines())

In [None]:
# Keys are sorted before comparing so the check does not depend on cell order.
assert ["input_path", "model_path", "some_param", "some_params"] == list(
    sorted(params_as_dict(test_nb).keys())
)

In [None]:
# Display the parsed parameters; values are kept as unevaluated source text.
params_as_dict(test_nb)

{'some_params': 'len([1, 2, 3])',
 'some_param': 'test',
 'input_path': 'str(Path(".").resolve())',
 'model_path': 'str(Path(".").resolve().parent)'}

# Parameter Data-types & Metadata

## Supported data types

In [None]:
# | export

# Scalar types; `extract_param_meta` marks values of these types as scalar.
supported_parameters = (str, int, float)
# Container types; `extract_param_meta` marks values of these types as JSON.
supported_args = (Dict, List)
# Types that `extract_param_meta` persists as "pandas", "numpy" or "pickle".
supported_conversion_args = (pd.Series, pd.DataFrame, np.ndarray, Path)
# Flat list of every supported type, in the order of the three groups above.
suppported_types = [
    *supported_parameters,
    *supported_args,
    *supported_conversion_args,
]
# Correctly spelled alias. The misspelled name above is kept because it is
# exported and may be imported elsewhere.
supported_types = suppported_types

## `ParamMeta` class

In [None]:
# | export


@dataclass
class ParamMeta:
    """Metadata describing how a single notebook parameter can be handled.

    Instances are created by `extract_param_meta`, one per parameter name.
    """

    # Concrete runtime type of the parameter value (`type(val)`).
    instance_type: type
    # Set to True by `extract_param_meta` only for str/int/float values.
    is_scalar: bool
    # Set to True by `extract_param_meta` only for Dict/List values.
    is_json_type: bool
    # One of "pickle", "numpy", "pandas" or "unsupported" as set by
    # `extract_param_meta`.
    persist_type: str
    # Set to True by `extract_param_meta` for scalar, Dict/List and Path values.
    has_metaflow_param: bool
    # Set to True by `extract_param_meta` only for scalar values.
    has_sagemaker_param: bool

## `load_module`

In [None]:
# | export


def load_module(fully_qualified_module_name):
    """Import and return a module by its dotted name.

    The project's configured `root_path` is appended to `sys.path` first (if it
    is not already there) so that project modules can be resolved.

    Args:
        fully_qualified_module_name: Dotted module name, e.g. "pkg.sub.mod".

    Returns:
        The imported module object.
    """
    project_root = str(get_config().path("root_path"))
    if project_root not in sys.path:
        sys.path.append(project_root)
    # Everything before the last dot is passed as the anchor package.
    parent_package = fully_qualified_module_name.rpartition(".")[0]
    return import_module(fully_qualified_module_name, parent_package)

In [None]:
# Example module name used by the step-by-step walkthrough in the next cell.
fully_qualified_module_name = "sciflow.test.test_data_handling"

In [None]:
# Walkthrough of the statements inside `load_module`, run at cell level:
# make the project root importable, then split the dotted name.
root_path = str(get_config().path("root_path"))
if not root_path in sys.path:
    sys.path.append(root_path)
package = ".".join(fully_qualified_module_name.split(".")[:-1])
# Last expression of the cell: displays the final component of the dotted name.
fully_qualified_module_name.split(".")[-1]

'test_data_handling'

In [None]:
# Loaded modules expose their parameters as evaluated module attributes.
assert load_module("sciflow.test.test_data_handling").int_param == 3
assert load_module("sciflow.test.test_data_handling").float_param == 1.1
assert type(load_module("sciflow.test.test_data_handling").series_param) == pd.Series
assert load_module("sciflow.test.test_multistep").traffic_percent == 1
assert load_module("sciflow.test.test_multistep_no_params").traffic_percent == 1
assert load_module("sciflow.test.test_export").some_param == "test"
assert load_module("sciflow.test.test_export").some_params == 3
assert load_module("sciflow.test.test_module").some_param == 2

## `extract_param_meta`

In [None]:
# | export


def _param_meta_for(val: Any):
    """Classify one parameter value and return its `ParamMeta`."""
    if any(isinstance(val, t) for t in supported_parameters):
        # Scalars.
        flags = (True, False, "pickle", True, True)
    elif any(isinstance(val, t) for t in supported_args):
        # Dict/List containers.
        flags = (False, True, "pickle", True, False)
    elif isinstance(val, np.ndarray):
        flags = (False, False, "numpy", False, False)
    elif isinstance(val, (pd.Series, pd.DataFrame)):
        flags = (False, False, "pandas", False, False)
    elif isinstance(val, Path):
        flags = (False, False, "pickle", True, False)
    else:
        flags = (False, False, "unsupported", False, False)

    scalar, json_type, persist, metaflow, sagemaker = flags
    return ParamMeta(
        instance_type=type(val),
        is_scalar=scalar,
        is_json_type=json_type,
        persist_type=persist,
        has_metaflow_param=metaflow,
        has_sagemaker_param=sagemaker,
    )


def extract_param_meta(module_name: str, params: Dict[str, Any]):
    """Build `ParamMeta` for each named parameter of a module.

    Args:
        module_name: Dotted name of the module to load with `load_module`.
        params: Mapping whose keys are the parameter names to inspect; the
            values are not used.

    Returns:
        A dict mapping each parameter name to the `ParamMeta` describing the
        value of the module attribute with that name.
    """
    module = load_module(module_name)
    # Resolve every attribute first so a missing name fails before classifying.
    resolved = {name: getattr(module, name) for name in params.keys()}
    return {name: _param_meta_for(value) for name, value in resolved.items()}

In [None]:
# Parameter names come from the notebook; their values are read from the
# module "sciflow.test.test_data_handling" by extract_param_meta.
nb_path = os.path.join(Path(".").resolve(), "test", "test_data_handling.ipynb")
params = params_as_dict(nb_path)
param_meta = extract_param_meta("sciflow.test.test_data_handling", params)

In [None]:
from pathlib import PosixPath

In [None]:
# Path parameters are checked with issubclass so the assertion does not depend
# on the platform-specific concrete class (PosixPath vs. WindowsPath).
assert all(
    [
        param_meta["int_param"].instance_type == int,
        param_meta["float_param"].instance_type == float,
        param_meta["str_param"].instance_type == str,
        issubclass(param_meta["input_path"].instance_type, Path),
        issubclass(param_meta["model_path"].instance_type, Path),
        param_meta["dict_param"].instance_type == dict,
        param_meta["list_param"].instance_type == list,
        param_meta["series_param"].instance_type == pd.Series,
        param_meta["df_param"].instance_type == pd.DataFrame,
    ]
)

In [None]:
# Scalar parameters (int, str, float) carry both the Metaflow and SageMaker flags.
assert param_meta["int_param"].has_metaflow_param
assert param_meta["int_param"].has_sagemaker_param
assert param_meta["str_param"].has_metaflow_param
assert param_meta["str_param"].has_sagemaker_param
assert param_meta["float_param"].has_metaflow_param
assert param_meta["float_param"].has_sagemaker_param

In [None]:
# List, dict and Path parameters carry the Metaflow flag but not the SageMaker flag.
assert param_meta["list_param"].has_metaflow_param
assert not param_meta["list_param"].has_sagemaker_param
assert param_meta["dict_param"].has_metaflow_param
assert not param_meta["dict_param"].has_sagemaker_param
assert param_meta["input_path"].has_metaflow_param
assert not param_meta["input_path"].has_sagemaker_param

In [None]:
# Same extraction for the multistep fixture; the last expression displays the
# parameter names that were found.
nb_path = Path(Path(".").resolve(), "test", "test_multistep.ipynb")
params = params_as_dict(nb_path)
param_meta = extract_param_meta("sciflow.test.test_multistep", params)
param_meta.keys()

dict_keys(['traffic_percent', 'workers', 'model_level', 'min_date'])