# Add a file for tracking

`dvc add test/test_dag.gml`

# Creates a .dvc file

`data.xml.dvc`

# Ignore this in .gitignore

* `git add data/data.xml.dvc data/.gitignore`
* `git commit -m "Add raw data"`

# Upload DVC tracked data

```
dvc remote add -d storage s3://bucket/dvcstore
git add .dvc/config
git commit -m "Configure remote storage"
```

# Push

`dvc push`

# Pull

`dvc pull`

In [80]:
# export

from pathlib import Path
from typing import Dict, List, Any
import os
import numpy as np
import pandas as pd
from dataclasses import dataclass
from importlib import import_module

from nbdev.export import Config
from sciflow.params import params_as_dict, extract_params, read_nb

In [81]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [82]:
# Value types that extract_param_meta records as scalar parameters.
supported_parameters = (str, int, float)
# Container types that extract_param_meta records as JSON-typed parameters.
supported_args = (Dict, List)
# Types checked by the conversion branch of extract_param_meta.
supported_conversion_args = (pd.Series, pd.DataFrame, np.ndarray, Path)
# Flat list of all of the above, in declaration order.
# NOTE: the name is misspelled ("suppported"); kept as-is because other
# code may reference it.
suppported_types = [
    *supported_parameters,
    *supported_args,
    *supported_conversion_args,
]

In [83]:
@dataclass
class Param:
    """Metadata record describing how one module-level parameter is handled.

    Instances are created by ``extract_param_meta``, one per parameter name.
    """
    # True when the value is a str, int or float; False otherwise.
    is_scalar: bool
    # True for scalar and Dict/List values, False for numpy/pandas values.
    # NOTE(review): presumably marks values exposed as Metaflow parameters - confirm.
    has_metaflow_param: bool
    # True only for Dict/List values.
    is_json_type: bool
    # One of 'pickle', 'numpy' or 'pandas', as assigned by extract_param_meta.
    persist_type: str

In [113]:
def load_module(fully_qualified_module_name):
    """Import and return the module with the given absolute dotted name.

    Args:
        fully_qualified_module_name: Absolute dotted module path, e.g.
            ``"package.subpackage.module"``. Relative names (leading dot)
            are not supported.

    Returns:
        The imported module object.

    Raises:
        ModuleNotFoundError: If the module cannot be located.
    """
    # import_module only consults its `package` argument for relative names,
    # so no package anchor is needed for a fully qualified name.
    return import_module(fully_qualified_module_name)

In [114]:
# Smoke test: load_module should return modules whose attributes are readable.
_test_data_module = load_module("sciflow.test.test_data_handling")
assert _test_data_module.int_param == 3
assert _test_data_module.float_param == 1.1
assert type(_test_data_module.series_param) == pd.Series
assert load_module("sciflow.utils").load_dremio_access is not None

In [138]:
def extract_param_meta(module_name: str, params: Dict[str, Any]):
    """Build a ``Param`` metadata record for each named module attribute.

    The module is loaded with ``load_module`` and each key of ``params`` is
    read from it as an attribute; the runtime type of that attribute decides
    which ``Param`` is recorded.

    Args:
        module_name: Dotted module name passed to ``load_module``.
        params: Mapping whose keys are the attribute names to inspect. The
            mapping's values are not used.

    Returns:
        Dict mapping parameter name to ``Param``. Names whose value has no
        known handling are omitted and reported with a printed message.

    Raises:
        AttributeError: If the module lacks an attribute named in ``params``.
    """
    module = load_module(module_name)
    # Resolve every attribute first so a missing name fails before any output.
    param_vals = {name: getattr(module, name) for name in params}
    param_meta = {}

    for key, val in param_vals.items():
        if isinstance(val, supported_parameters):
            param_meta[key] = Param(is_scalar=True, has_metaflow_param=True,
                                    is_json_type=False, persist_type='pickle')
            continue
        if isinstance(val, supported_args):
            param_meta[key] = Param(is_scalar=False, has_metaflow_param=True,
                                    is_json_type=True, persist_type='pickle')
            continue
        if isinstance(val, supported_conversion_args):
            if isinstance(val, np.ndarray):
                param_meta[key] = Param(is_scalar=False, has_metaflow_param=False,
                                        is_json_type=False, persist_type='numpy')
                continue
            if isinstance(val, (pd.Series, pd.DataFrame)):
                param_meta[key] = Param(is_scalar=False, has_metaflow_param=False,
                                        is_json_type=False, persist_type='pandas')
                continue
        # Reached for unknown types and for conversion types with no persist
        # mapping (e.g. Path), which were previously dropped without notice.
        print(f'Skipping unsupported param type, for: {key} - {type(val)}')
    return param_meta

In [139]:
# Demo call: display the metadata extracted for the test module.
# NOTE(review): `module_name` and `params` are not defined in the visible
# cells - presumably set in an earlier cell; confirm before re-running.
extract_param_meta(module_name, params)

{'int_param': Param(is_scalar=True, has_metaflow_param=True, is_json_type=False, persist_type='pickle'),
 'float_param': Param(is_scalar=True, has_metaflow_param=True, is_json_type=False, persist_type='pickle'),
 'str_param': Param(is_scalar=True, has_metaflow_param=True, is_json_type=False, persist_type='pickle'),
 'dict_param': Param(is_scalar=False, has_metaflow_param=True, is_json_type=True, persist_type='pickle'),
 'list_param': Param(is_scalar=False, has_metaflow_param=True, is_json_type=True, persist_type='pickle'),
 'ones': Param(is_scalar=False, has_metaflow_param=False, is_json_type=False, persist_type='numpy'),
 'text': Param(is_scalar=False, has_metaflow_param=False, is_json_type=False, persist_type='numpy'),
 'series_param': Param(is_scalar=False, has_metaflow_param=False, is_json_type=False, persist_type='pandas'),
 'df_param': Param(is_scalar=False, has_metaflow_param=False, is_json_type=False, persist_type='pandas')}

int_param <class 'int'>
float_param <class 'float'>
str_param <class 'str'>
input_path <class 'pathlib.PosixPath'>
model_path <class 'pathlib.PosixPath'>
dict_param <class 'dict'>
list_param <class 'list'>
ones <class 'numpy.ndarray'>
text <class 'numpy.ndarray'>
series_param <class 'pandas.core.series.Series'>
df_param <class 'pandas.core.frame.DataFrame'>


int