# Creating Feature Set

> Functions Used In Modeling Efforts

In [None]:
#| default_exp scripts.data_creation

In [None]:
#| export 
from fastcore.script import Param, call_parse

from data_system_utilities.snowflake.query import Snowflake
from data_system_utilities.azure.storage import FileHandling
from data_system_utilities.snowflake.copyinto import sf_to_adls_url_query_generator

from machine_learning_utilities.dataset_creation.snowflake import pull_features_from_snowflake

from LTBP.data.utils import query_feature_sets_to_adls_parquet_sf_fs, snowflake_query, get_yaml_dicts
from LTBP import files

import os
import logging
import pandas as pd
import logging
import os

In [None]:
#| hide
from pathlib import Path
import yaml

In [None]:
#| skip
def write_yaml_file(file_path: str, file_name: str, dictionary: dict):
    """Dump ``dictionary`` as YAML into ``file_name`` located under ``file_path``.

    The target is opened in ``'w'`` mode, so any existing content is replaced.
    """
    destination = Path(file_path) / file_name
    with destination.open('w') as stream:
        yaml.dump(dictionary, stream)

# ETL settings written out to etl.yaml; every value is stored as a string.
etl_dict = {
    'azure_account': 'vaildtscadls',
    'azure_container': 'vailadls',
    'data_lake_path': 'projects/LTBP/FY23/',
    'exp_name': 'status_quo/',
    'max_file_size': '32000000',
    'over_write': 'True',
    'query_file_location': 'sql_files/',
    'stage_name': 'resortvisitation',
}

write_yaml_file(
    file_path='./LTBP/files/yaml_files/',
    file_name='etl.yaml',
    dictionary=etl_dict,
)

`data_creation`

With the utilities from nbdev and fastcore, the function below becomes a command-line script that is registered in the **console_scripts** section of `settings.ini`.

Go to `./settings.ini`, find this section, and add the scripts that you make.

```ini
console_scripts = data_creation=LTBP.scripts.data_creation:data_creation
<name of command line arg> = <library name>.<path to function>.<file name>:<function name>
```

**What is happening in this script**

Overview:

TODO: re-write

> Key Note: the `train_or_inference` argument is the trigger for the test/inference data set.

In [None]:
#| export


@call_parse
def data_creation(train_or_inference: Param(help="YAML section to read", type=str, default='TRAINING'), # noqa:
                  experiment: Param(help="Pass 'True' for an experiment run; any other value is a production run", type=str, default='False')):  # noqa:
    """Creates a feature set for a experiment data set or a production level run feature set

    Args:
        train_or_inference: Name of the section of ``udf_inputs.yaml`` to use;
            it is upper-cased before the lookup.
        experiment: String flag from the command line. Only the exact value
            ``'True'`` selects an experiment run.

    The destination in the data lake is
    ``<data_lake_path>/experiments/<exp_name>`` for an experiment run, and
    ``<data_lake_path>/<CI_COMMIT_SHA>`` (``'LocalRunNBS'`` when that
    environment variable is unset) for a production run. The feature set is
    only queried and written when the listing of that destination is empty;
    otherwise a warning is logged and nothing is written.

    Requires the environment variables ``DATALAKE_CONN_STR_SECRET`` and, when
    data is written, ``DATALAKE_SAS_TOKEN_SECRET``.
    """
    # The CLI hands the flag over as a string; only the literal 'True' enables it.
    experiment = experiment == 'True'
    logging.info(f"This is a {'experiment run' if experiment else 'production run'}")
    logging.info('Loading Yaml Files..')
    features, udf_inputs, etl = get_yaml_dicts(['features.yaml', 'udf_inputs.yaml', 'etl.yaml'])
    logging.info('Generating Feature Set Query')
    query = pull_features_from_snowflake(feature_dict=features,
                                         udf_inputs=udf_inputs[train_or_inference.upper()],
                                         filepath_to_grain_list_query='./LTBP/files/sql_files/')
    if experiment:
        data_lake_path = os.path.join(etl['data_lake_path'], 'experiments', etl['exp_name'])
    else:
        # Production output is keyed by the commit being built, with a fixed
        # fallback name for runs outside CI.
        data_lake_path = os.path.join(etl['data_lake_path'],
                                      os.environ.get('CI_COMMIT_SHA', 'LocalRunNBS'))
    logging.info(f'Checking {data_lake_path} to either skip creation for experiment or create a production dataset')
    fh = FileHandling(os.environ['DATALAKE_CONN_STR_SECRET'])
    ald_files = fh.ls_blob(path=data_lake_path, container_name=etl['azure_container'])
    sf = snowflake_query()
    if ald_files == []:
        query_feature_sets_to_adls_parquet_sf_fs(
            sf_connection=sf,
            sf_query=query,
            # etl.yaml stores the SQL directory under 'query_file_location';
            # the previous 'query_file_path' key is not defined there and
            # raised KeyError.
            query_file_path=os.path.join(files.__path__[0], etl['query_file_location']),
            azure_account=etl["azure_account"],
            azure_container=etl["azure_container"],
            data_lake_path=data_lake_path, # TODO: Think about experiments versus 
            partition_by=None,
            data_lake_sas_token=os.environ["DATALAKE_SAS_TOKEN_SECRET"],
        )
    else:
        logging.warning(f'{data_lake_path} already exists this should be do experimentation runs')

In [None]:
#| hide
# experiment = True # this will trigger if the feature set needs to be created
# train_or_inference = 'TRAINING'


# logging.info('Loading Yaml Files..')
# features, udf_inputs, etl = get_yaml_dicts(['features.yaml', 'udf_inputs.yaml', 'etl.yaml'])
# logging.info('Generating Feature Set Query')
# query = pull_features_from_snowflake(feature_dict=features,
#                                      udf_inputs=udf_inputs[train_or_inference.upper()],
#                                      filepath_to_grain_list_query='./LTBP/files/sql_files/')

# data_lake_path = (os.path.join(etl['data_lake_path'], 'experiments', etl['exp_name'])
#                   if experiment 
#                   else os.path.join(etl['data_lake_path'], 
#                                     os.environ.get('CI_COMMIT_SHA', 'LocalRunNBS')))
# logging.info(f'Checking {data_lake_path} to either skip creation for experiment or create a production dataset')

# fh = FileHandling(os.environ['DATALAKE_CONN_STR_SECRET'])
# ald_files = fh.ls_blob(path=data_lake_path, container_name=etl['azure_container'])
# sf = snowflake_query()
# if ald_files == []:
#     query_feature_sets_to_adls_parquet_sf_fs(
#         sf_connection=sf,
#         sf_query=query,
#         query_file_path=os.path.join(files.__path__[0], query_file_path),
#         azure_account=etl["azure_account"],
#         azure_container=etl["azure_container"],
#         data_lake_path=data_lake_path, # TODO: Think about experiments versus 
#         partition_by=None,
#         data_lake_sas_token=os.environ["DATALAKE_SAS_TOKEN_SECRET"],
#     )
# else:
#     logging.warning(f'{data_lake_path} already exists this should be do experimentation runs')

In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev

nbdev.nbdev_export()