# Add a file for tracking

`dvc add test/test_dag.gml`

# Creates a .dvc file

`data.xml.dvc`

# Ignore the data file in .gitignore and commit the .dvc file

* `git add data/data.xml.dvc data/.gitignore`
* `git commit -m "Add raw data"`

# Upload DVC tracked data

```
dvc remote add -d storage s3://bucket/dvcstore
git add .dvc/config
git commit -m "Configure remote storage"
```

# Push

`dvc push`

# Pull

`dvc pull`

In [31]:
# default_exp data_handler

In [32]:
# export

from dataclasses import dataclass
from importlib import import_module
from pathlib import Path
from typing import Any, Dict, List

import os
import numpy as np
import pandas as pd

from sciflow.params import params_as_dict

In [33]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
#export

# Value types that extract_param_meta treats as scalar parameters.
supported_parameters = (str, int, float)
# Container types that extract_param_meta marks as JSON-typed parameters.
supported_args = (Dict, List)
# Types checked last by extract_param_meta (numpy / pandas / path values).
supported_conversion_args = (pd.Series, pd.DataFrame, np.ndarray, Path)

# Flat list of every type above, in declaration order.
# NOTE: the name is spelled with three p's; it is kept as-is because it is
# part of the exported module namespace.
suppported_types = [
    *supported_parameters,
    *supported_args,
    *supported_conversion_args,
]

In [35]:
#export

@dataclass
class Param:
    """Metadata describing how one parameter value is classified.

    Instances are created by ``extract_param_meta`` from the runtime value of
    each parameter found on a loaded module.
    """

    # Concrete runtime type of the value (``type(val)``).
    instance_type: type
    # True for str/int/float values, False for every other supported type.
    is_scalar: bool
    # True for str/int/float and Dict/List values, False for numpy/pandas
    # values. NOTE(review): presumably marks values that can be declared as
    # Metaflow parameters - confirm against the code that consumes this.
    has_metaflow_param: bool
    # True only for Dict/List values.
    is_json_type: bool
    # One of "pickle", "numpy" or "pandas", as assigned by extract_param_meta.
    persist_type: str

In [36]:
#export


def load_module(fully_qualified_module_name: str):
    """Import and return the module identified by a dotted module path.

    Args:
        fully_qualified_module_name: Dotted path of the module to import,
            e.g. ``"package.subpackage.module"``.

    Returns:
        The imported module object.

    Raises:
        ImportError: Propagated from ``importlib.import_module`` when the
            module (or one of its parent packages) cannot be imported.
    """
    # Everything before the last dot is the parent package ("" when the name
    # has no dot). import_module only consults it for relative names, so it
    # has no effect on an absolute dotted path; it is still passed through to
    # keep the call identical to the previous behaviour.
    package, _, _ = fully_qualified_module_name.rpartition(".")
    return import_module(fully_qualified_module_name, package)

In [37]:
# Smoke checks for load_module: attributes defined in the test module must be
# reachable on the imported module object with the expected values/types.
assert load_module("sciflow.test.test_data_handling").int_param == 3
assert load_module("sciflow.test.test_data_handling").float_param == 1.1
assert type(load_module("sciflow.test.test_data_handling").series_param) == pd.Series
# A module outside the test package can be loaded the same way.
assert load_module("sciflow.utils").load_dremio_access is not None

In [38]:
#export


def extract_param_meta(module_name: str, params: Dict[str, Any]) -> "Dict[str, Param]":
    """Classify each named parameter of a module by the type of its value.

    The module is imported with ``load_module`` and every key of ``params`` is
    looked up as an attribute on it. Only the keys of ``params`` are used; the
    values are ignored.

    Args:
        module_name: Dotted path of the module that defines the parameters.
        params: Mapping whose keys are the attribute names to inspect.

    Returns:
        A dict mapping each parameter name with a supported value type to its
        ``Param`` metadata. Parameters with an unsupported type are left out
        and reported on stdout.

    Raises:
        AttributeError: If a key of ``params`` is not an attribute of the
            loaded module.
    """
    module = load_module(module_name)
    # Resolve every value up front so a missing attribute fails before any
    # "Skipping" message is printed.
    param_vals = {name: getattr(module, name) for name in params}

    param_meta = {}
    for key, val in param_vals.items():
        if isinstance(val, supported_parameters):
            param_meta[key] = Param(
                instance_type=type(val),
                is_scalar=True,
                has_metaflow_param=True,
                is_json_type=False,
                persist_type="pickle",
            )
        elif isinstance(val, supported_args):
            param_meta[key] = Param(
                instance_type=type(val),
                is_scalar=False,
                has_metaflow_param=True,
                is_json_type=True,
                persist_type="pickle",
            )
        elif isinstance(val, np.ndarray):
            param_meta[key] = Param(
                instance_type=type(val),
                is_scalar=False,
                has_metaflow_param=False,
                is_json_type=False,
                persist_type="numpy",
            )
        elif isinstance(val, (pd.Series, pd.DataFrame)):
            param_meta[key] = Param(
                instance_type=type(val),
                is_scalar=False,
                has_metaflow_param=False,
                is_json_type=False,
                persist_type="pandas",
            )
        else:
            # Also reached for Path values: Path is listed in
            # supported_conversion_args but has no persist type defined here,
            # so it is reported instead of being dropped without any message.
            print(f"Skipping unsupported param type, for: {key} - {type(val)}")
    return param_meta

In [39]:
# Absolute path of the test notebook, resolved from the current working
# directory and kept as a plain string.
nb_path = str(Path(".").resolve() / "test" / "test_data_handling.ipynb")
# Parameters declared in that notebook, as returned by params_as_dict.
params = params_as_dict(nb_path)

In [40]:
# Classify the test notebook's parameters; as the cell's final expression the
# resulting dict is displayed as the cell output.
extract_param_meta("sciflow.test.test_data_handling", params)

{'int_param': Param(instance_type=<class 'int'>, is_scalar=True, has_metaflow_param=True, is_json_type=False, persist_type='pickle'),
 'float_param': Param(instance_type=<class 'float'>, is_scalar=True, has_metaflow_param=True, is_json_type=False, persist_type='pickle'),
 'str_param': Param(instance_type=<class 'str'>, is_scalar=True, has_metaflow_param=True, is_json_type=False, persist_type='pickle'),
 'dict_param': Param(instance_type=<class 'dict'>, is_scalar=False, has_metaflow_param=True, is_json_type=True, persist_type='pickle'),
 'list_param': Param(instance_type=<class 'list'>, is_scalar=False, has_metaflow_param=True, is_json_type=True, persist_type='pickle'),
 'ones': Param(instance_type=<class 'numpy.ndarray'>, is_scalar=False, has_metaflow_param=False, is_json_type=False, persist_type='numpy'),
 'text': Param(instance_type=<class 'numpy.ndarray'>, is_scalar=False, has_metaflow_param=False, is_json_type=False, persist_type='numpy'),
 'series_param': Param(instance_type=<clas