# Creating Feature Set

> Functions Used In Modeling Efforts

In [None]:
#| default_exp scripts.data_creation

In [None]:
#| export 
from fastcore.script import Param, call_parse

from data_system_utilities.snowflake.query import Snowflake
from data_system_utilities.azure.storage import FileHandling
from data_system_utilities.snowflake.copyinto import sf_to_adls_url_query_generator

from LTBP.data.utils import (
    query_feature_sets_to_adls_parquet_sf_fs, snowflake_query,
    get_yaml_dicts, pull_features_from_snowflake
    )
from LTBP import files

import os
import logging
import pandas as pd
import logging
import os

In [None]:
#| hide
from pathlib import Path
import yaml

In [None]:
#| hide
def write_yaml_file(file_path: str, file_name: str, dictionary: dict):
    """Dump ``dictionary`` as YAML into ``file_name`` located under ``file_path``.

    The file is opened in write mode, so any existing content is replaced.
    """
    target = Path(file_path) / file_name
    with target.open('w') as handle:
        yaml.dump(dictionary, handle)

# ETL settings for the project; numeric- and boolean-looking values are kept as strings.
etl_dict = {
    'azure_account': 'vaildtscadls',
    'azure_container': 'vailadls',
    'data_lake_path': 'projects/LTBP/FY23/',
    'max_file_size': '32000000',
    'over_write': 'True',
    'query_file_path': 'sql_files/',
    'stage_name': 'ltbp',
    'FY_folder': 'FY23',
    # One entry per run type; nothing extra is configured for either.
    'extra_statement': {
        'TRAINING': None,
        'INFERENCE': None,
    },
}

# Write the ETL settings to etl.yaml; the target folder is relative to the current working directory.
write_yaml_file('./LTBP/files/yaml_files/', 'etl.yaml', etl_dict)

`data_creation`

Using nbdev and fastcore's `call_parse`, the function below becomes a command-line script, which is registered under **console_scripts** in settings.ini.

Go to `./settings.ini`, find this section, and add the scripts that you create.

```ini
console_scripts = data_creation=LTBP.scripts.data_creation:data_creation
<name of command line arg> = <library name>.<path to function>.<file name>:<function name>
```

**What is happening in this script**

Overview:

TODO: re-write

> Key Note: `train_or_inference` is the trigger that selects between the training and the inference data set.

In [None]:
#| export


@call_parse
def data_creation(train_or_inference: Param(help="Section of udf_inputs.yaml to use, e.g. TRAINING or INFERENCE", type=str, default='TRAINING'),  # noqa:
                  experiment_name: Param(help="Experiment name used for the feature query and the data lake folder", type=str, default='BASELINE'),  # noqa:
                  experiment: Param(help="'True' for an experiment run; any other value is a production run", type=str, default='True')):  # noqa:
    """Create the feature set for an experiment or production run and unload it to ADLS.

    The feature query is generated from ``features.yaml`` and the
    ``train_or_inference`` section of ``udf_inputs.yaml``. The target folder is
    ``<data_lake_path>/experiments/<experiment_name>/<train_or_inference>_data/``
    for experiment runs and
    ``<data_lake_path>/<CI_COMMIT_SHA or 'LocalRunTest'>/<train_or_inference>_data/``
    otherwise. If blobs are already listed under that folder, nothing is
    created and a warning is logged.

    Args:
        train_or_inference: Key into ``udf_inputs.yaml`` (upper-cased before
            lookup); its lower-cased form names the output folder.
        experiment_name: Passed to the feature query builder and used as the
            experiment folder name.
        experiment: ``'true'`` (case-insensitive) marks an experiment run. A
            real ``bool`` is accepted as well for programmatic callers.

    Raises:
        KeyError: If ``DATALAKE_CONN_STR_SECRET`` is not set, or
            ``DATALAKE_SAS_TOKEN_SECRET`` is not set when the data has to be
            created.
    """
    # Blob paths always use '/', so they must not depend on the host OS separator.
    import posixpath

    # str() keeps the CLI contract (strings) while tolerating a bool argument.
    experiment = str(experiment).lower() == 'true'
    logging.info(f"This is a {'experiment run' if experiment else 'production run'}")
    logging.info('Loading Yaml Files..')
    features, udf_inputs, etl = get_yaml_dicts(['features.yaml', 'udf_inputs.yaml', 'etl.yaml'])
    logging.info('Generating Feature Set Query')
    # NOTE(review): this path is relative to the working directory, unlike
    # query_file_path below which is resolved from the installed package.
    query = pull_features_from_snowflake(feature_dict=features,
                                         udf_inputs=udf_inputs[train_or_inference.upper()],
                                         filepath_to_grain_list_query='./LTBP/files/sql_files/',
                                         experiment_name=experiment_name)
    if experiment:
        run_folder = posixpath.join(etl['data_lake_path'], 'experiments', experiment_name)
    else:
        # Production runs are keyed by commit; 'LocalRunTest' is the fallback outside CI.
        run_folder = posixpath.join(etl['data_lake_path'],
                                    os.environ.get('CI_COMMIT_SHA', 'LocalRunTest'))
    data_lake_path = posixpath.join(run_folder, train_or_inference.lower() + '_data/')
    logging.info(f'Checking {data_lake_path} to either skip creation for experiment or create a production dataset')
    fh = FileHandling(os.environ['DATALAKE_CONN_STR_SECRET'])

    ald_files = fh.ls_blob(path=data_lake_path, container_name=etl['azure_container'])
    if ald_files:
        # Data is already present at the target path: leave it untouched.
        logging.warning(f'{data_lake_path} already exists this should be do experimentation runs')
        return

    # The Snowflake query object is only needed when data actually has to be created.
    query_feature_sets_to_adls_parquet_sf_fs(
        sf_connection=snowflake_query(),
        sf_query=query,
        query_file_path=os.path.join(files.__path__[0], etl['query_file_path']),
        azure_account=etl["azure_account"],
        azure_container=etl["azure_container"],
        data_lake_path=data_lake_path,
        partition_by=None,
        data_lake_sas_token=os.environ["DATALAKE_SAS_TOKEN_SECRET"],
    )

In [None]:
# | hide
"""
Local development helper: this cell mirrors the body of `data_creation` so each step can be
run and adjusted interactively (a .py file works as well; the notebook cell is the preferred way here).
"""
experiment = 'True'  # string flag, converted to a bool just below
train_or_inference = 'TRAINING'  # alternative: 'INFERENCE'
experiment_name = 'BASELINE'

experiment = experiment.lower() == 'true'
logging.info(f"This is a {'experiment run' if experiment else 'production run'}")

logging.info('Loading Yaml Files..')
features, udf_inputs, etl = get_yaml_dicts(['features.yaml', 'udf_inputs.yaml', 'etl.yaml'])

logging.info('Generating Feature Set Query')
query = pull_features_from_snowflake(
    feature_dict=features,
    udf_inputs=udf_inputs[train_or_inference.upper()],
    filepath_to_grain_list_query='./LTBP/files/sql_files/',
    experiment_name=experiment_name,
)

# Experiment runs go under experiments/<name>; other runs under the commit SHA (or 'LocalRunTest').
if experiment:
    data_lake_path = os.path.join(etl['data_lake_path'], 'experiments', experiment_name)
else:
    data_lake_path = os.path.join(etl['data_lake_path'],
                                  os.environ.get('CI_COMMIT_SHA', 'LocalRunTest'))
data_lake_path = os.path.join(data_lake_path, train_or_inference.lower() + '_data/')
logging.info(f'Checking {data_lake_path} to either skip creation for experiment or create a production dataset')

fh = FileHandling(os.environ['DATALAKE_CONN_STR_SECRET'])
ald_files = fh.ls_blob(path=data_lake_path, container_name=etl['azure_container'])
sf = snowflake_query()

# Only an empty listing triggers the unload; otherwise the existing data is kept.
if ald_files != []:
    logging.warning(f'{data_lake_path} already exists this should be do experimentation runs')
else:
    query_feature_sets_to_adls_parquet_sf_fs(
        sf_connection=sf,
        sf_query=query,
        query_file_path=os.path.join(files.__path__[0], etl['query_file_path']),
        azure_account=etl["azure_account"],
        azure_container=etl["azure_container"],
        data_lake_path=data_lake_path,
        partition_by=None,
        data_lake_sas_token=os.environ["DATALAKE_SAS_TOKEN_SECRET"],
    )

INFO:root:This is a experiment run
INFO:root:Loading Yaml Files..
INFO:root:Generating Feature Set Query
INFO:root:static features in data set: 
 ['DESTINATIONGEOAFINITYLABEL', 'GenderCode']
INFO:root:temporal features in data set: 
 ['Age', 'AvgVisitPerSeason', 'BoughtPass', 'EverCorePass', 'EverPass', 'GuestBehavior', 'IsEpicMixActivated', 'MarketingZone', 'MostCommonTicketComp', 'MostSubSeasonVisited', 'MostVisitedRegion', 'MostVisitedResort', 'OnlySingleResortKey', 'PartnerResortScannerFlag', 'ResortsVisited', 'SkierabilityLabel', 'SubSeasonsPerYear', 'TotalSeasonsScanned', 'TotalVisits', 'VisitMostInPeak']
INFO:root:Appending static feature DESTINATIONGEOAFINITYLABEL to query
INFO:root:Appending static feature GenderCode to query
INFO:root:Finished appending static features
INFO:root:reading training_ecids_18_19.sql for base query...
INFO:root:reading training_ecids_19_20.sql for base query...
INFO:root:reading training_ecids_20_21.sql for base query...
INFO:root:final query outpu

INFO:root:Checking projects/LTBP/FY23/experiments/BASELINE/training_data/ to either skip creation for experiment or create a production dataset
INFO:data_system_utilities.azure.storage:number of files in container path recursively 0
INFO:data_system_utilities.snowflake.copyinto:
COPY INTO 'azure://vaildtscadls.blob.core.windows.net/vailadls/projects/LTBP/FY23/experiments/BASELINE/training_data/'
FROM (select
base.*
, joined.DESTINATIONGEOAFINITYLABEL
, joined.GenderCode
, MACHINELEARNINGFEATURES.PROD.Age_ECID_Temporal(base.ECID, 20171001, 20191005) as Age, MACHINELEARNINGFEATURES.PROD.AvgVisitPerSeason_ECID_Temporal(base.ECID, 20171001, 20191005) as AvgVisitPerSeason, MACHINELEARNINGFEATURES.PROD.BoughtPass_ECID_Temporal(base.ECID, '2019/20') as BoughtPass, MACHINELEARNINGFEATURES.PROD.EverCorePass_ECID_Temporal(base.ECID, 20151005, 20191005) as EverCorePass, MACHINELEARNINGFEATURES.PROD.EverPass_ECID_Temporal(base.ECID, 20151005, 20191005) as EverPass, MACHINELEARNINGFEATURES.PROD.Gue

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:root:data has been delivered from sf to adls


In [None]:
# | hide
from LTBP.data.utils import snowflake_query, get_yaml_dicts, generate_data_lake_query
from LTBP import files

from data_system_utilities.file_parsers import yaml
from data_system_utilities.snowflake.utils import make_stage_query_generator

#| hide
# Folder the feature sets were unloaded to. It mirrors how data_lake_path is built when the
# data is created: experiments/<experiment_name> for experiment runs, otherwise the commit SHA
# (falling back to 'LocalRunTest' outside CI). URL segments are joined with '/' explicitly.
run_folder = (f"experiments/{experiment_name}"
              if experiment
              else os.environ.get('CI_COMMIT_SHA', 'LocalRunTest'))
stage_url = (f"azure://{etl['azure_account']}.blob.core.windows.net/"
             f"{etl['azure_container']}/{etl['data_lake_path']}{run_folder}")

# Computed once so stage creation and the read query below always use the same stage.
stage_name = etl["stage_name"] + etl['FY_folder'] + os.environ.get('CI_COMMIT_SHA', 'LocalRunTest')
stage_query = make_stage_query_generator(
    stage_name=stage_name,
    url=stage_url,
    sas_token=os.environ["DATALAKE_SAS_TOKEN_SECRET"],
    file_type="parquet",
)
_ = sf.run_sql_str(stage_query)

# TODO: Figure out an identification feature like season year.
# The UDF grain is ECID, which is easy to get, but season year isn't obvious; some thought is needed.
# Keep only the column name from each grain entry (text after the last '.').
indentification = [col.split('.')[-1] for col in udf_inputs[train_or_inference.upper()]['UDF_GRAIN']]
columns = [col.upper() for col in features]
query = generate_data_lake_query(stage_name=stage_name,
                                 stage_path=train_or_inference.lower() + '_data/',
                                 columns=indentification + columns,
                                 extra_statement=None)
logging.info(f'adls snowflake stage query {query}')
df = sf.run_sql_str(query)
logging.info(f'Preview dataframe queried {df.head()}')

INFO:data_system_utilities.snowflake.utils:stage_query: 
 create or replace stage ltbpFY23LocalRunTest
url='azure://vaildtscadls.blob.core.windows.net/vailadls/projects/LTBP/FY23/experiments/BASELINE'
credentials=(azure_sas_token='**MASKED**')
encryption=(type= 'NONE')
file_format = (type = parquet        )
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:data_system_utilities.snowflake.query:Stage area LTBPFY23LOCALRUNTEST successfully created.
INFO:root:adls snowflake stage query 
    select
    $1:"ECID"::varchar as ECID
, $1:"AGE"::varchar as AGE
, $1:"AVGVISITPERSEASON"::varchar as AVGVISITPERSEASON
, $1:"BOUGHTPASS"::varchar as BOUGHTPASS
, $1:"DESTINATIONGEOAFINITYLABEL"::varchar as DESTINATIONGEOAFINITYLABEL
, $1:"EVERCOREPASS":

In [None]:
# | hide
"""
Local development helper for the inference data set: same steps as `data_creation`, laid out
in a cell so they can be run and changed interactively (a .py file is an alternative).
"""
experiment = 'True'  # string flag, converted to a bool just below
train_or_inference = 'INFERENCE'
experiment_name = 'BASELINE'

experiment = experiment.lower() == 'true'
logging.info(f"This is a {'experiment run' if experiment else 'production run'}")

logging.info('Loading Yaml Files..')
features, udf_inputs, etl = get_yaml_dicts(['features.yaml', 'udf_inputs.yaml', 'etl.yaml'])

logging.info('Generating Feature Set Query')
query = pull_features_from_snowflake(
    feature_dict=features,
    udf_inputs=udf_inputs[train_or_inference.upper()],
    filepath_to_grain_list_query='./LTBP/files/sql_files/',
    experiment_name=experiment_name,
)

# Pick the parent folder first, then append the '<run type>_data/' leaf.
if experiment:
    data_lake_path = os.path.join(etl['data_lake_path'], 'experiments', experiment_name)
else:
    data_lake_path = os.path.join(etl['data_lake_path'],
                                  os.environ.get('CI_COMMIT_SHA', 'LocalRunTest'))
data_lake_path = os.path.join(data_lake_path, train_or_inference.lower() + '_data/')
logging.info(f'Checking {data_lake_path} to either skip creation for experiment or create a production dataset')

fh = FileHandling(os.environ['DATALAKE_CONN_STR_SECRET'])
ald_files = fh.ls_blob(path=data_lake_path, container_name=etl['azure_container'])
sf = snowflake_query()

# An empty listing means nothing has been unloaded to this path yet.
if ald_files != []:
    logging.warning(f'{data_lake_path} already exists this should be do experimentation runs')
else:
    query_feature_sets_to_adls_parquet_sf_fs(
        sf_connection=sf,
        sf_query=query,
        query_file_path=os.path.join(files.__path__[0], etl['query_file_path']),
        azure_account=etl["azure_account"],
        azure_container=etl["azure_container"],
        data_lake_path=data_lake_path,
        partition_by=None,
        data_lake_sas_token=os.environ["DATALAKE_SAS_TOKEN_SECRET"],
    )

INFO:root:This is a experiment run
INFO:root:Loading Yaml Files..
INFO:root:Generating Feature Set Query
INFO:root:static features in data set: 
 ['DESTINATIONGEOAFINITYLABEL', 'GenderCode']
INFO:root:temporal features in data set: 
 ['Age', 'AvgVisitPerSeason', 'BoughtPass', 'EverCorePass', 'EverPass', 'GuestBehavior', 'IsEpicMixActivated', 'MarketingZone', 'MostCommonTicketComp', 'MostSubSeasonVisited', 'MostVisitedRegion', 'MostVisitedResort', 'OnlySingleResortKey', 'PartnerResortScannerFlag', 'ResortsVisited', 'SkierabilityLabel', 'SubSeasonsPerYear', 'TotalSeasonsScanned', 'TotalVisits', 'VisitMostInPeak']
INFO:root:Appending static feature DESTINATIONGEOAFINITYLABEL to query
INFO:root:Appending static feature GenderCode to query
INFO:root:Finished appending static features
INFO:root:reading inference_base.sql for base query...
INFO:root:final query output: 
 select
base.*
, joined.DESTINATIONGEOAFINITYLABEL
, joined.GenderCode
, MACHINELEARNINGFEATURES.PROD.Age_ECID_Temporal(base

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:root:data has been delivered from sf to adls


In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev

nbdev.nbdev_export()