# Creating Feature Set

> Functions Used In Modeling Efforts

In [None]:
#| default_exp scripts.data_creation

In [None]:
# export 
# from dsde.modeling.premodel import make_data_lake_stage
from fastcore.script import Param, call_parse
# from buypass.utils import snowflake_query, get_yaml_dicts, snowflake_to_data_lake_from_query, validate_sections
# from buypass.datacreation import generate_feature_set_query

import pandas as pd
import logging
import os


# Root logger reports INFO and above; chatty third-party loggers are raised to
# the thresholds listed here so they do not drown out this module's messages.
logging.basicConfig(level=logging.INFO)
for _noisy_logger, _quiet_level in (
    ("azure.core", logging.WARNING),
    ("urllib3.connectionpool", logging.CRITICAL),
    ("snowflake.connector", logging.WARNING),
):
    logging.getLogger(_noisy_logger).setLevel(_quiet_level)
# Module-level logger used by the functions defined in this notebook.
logger = logging.getLogger(__name__)

## data_creation

Using nbdev and fastcore, the function below becomes a command-line script that is registered in settings.ini under **console_scripts**.

Go to ./settings.ini, find this section, and add the scripts that you make.

```ini
console_scripts = data_creation=buypass.scripts.preprocess:data_creation
<name of command line arg> = <library name>.<path to function>.<file name>:<function name>
```

**What is happening in this script**

Overview:

1. Reads yaml files needed to complete this stage ``get_yaml_dicts``

2. Generate query, feature names and feature types for DS model ``generate_feature_set_query``

3. Generates datalake stage ``make_data_lake_stage``

4. Executes generated query and dumps to the datalake stage that was created in step 3 ``snowflake_to_data_lake_from_query``

> Key Note: the ``train_or_inference`` argument is the trigger for the test/inference data set.

In [None]:
# yaml_file_list: Param(help="YAML file to read", type=list,  # noqa:
#                                         default=['dataset.yaml', 'etl.yaml', 'experiment.yaml']),  # noqa:

In [None]:
#| export
@call_parse
def data_creation(train_or_inference: Param(help="YAML section to read", type=str, default='train')):  # noqa:
    """
    Build the feature-set query, unload its result to the data lake and save the feature list.

    What is happening in this script:
    1. Reads yaml files needed to complete this stage ``get_yaml_dicts``
    2. Generates query, feature names and feature types for DS model ``generate_feature_set_query``
    3. Generates datalake stage ``make_data_lake_stage``
    4. Executes generated query and dumps to the datalake stage that was created in step 3 ``snowflake_to_data_lake_from_query``
    5. Writes the feature names and dtypes to ``features.csv`` (train) or ``inference_features.csv`` (anything else)

    Key Note: ``train_or_inference`` is the trigger for the test/inference data set;
    any value other than ``'train'`` takes the inference path.

    Args:
        train_or_inference: section passed to ``validate_sections``; ``'train'`` selects
            ``etl['data_lake']['training_path']``, any other value selects
            ``etl['data_lake']['inference_path']``.

    Raises:
        KeyError: if the ``prod_or_dev`` or ``CI_COMMIT_SHA`` environment variable, or the
            variable named by ``etl['data_lake']['sas_token']``, is not set.
    """
    # NOTE(review): get_yaml_dicts, validate_sections, generate_feature_set_query,
    # snowflake_query, make_data_lake_stage and snowflake_to_data_lake_from_query must be
    # in scope; their imports at the top of this notebook are currently commented out.

    # ``yaml_file_list`` used to be an undefined name here; these are the defaults from the
    # commented-out ``Param`` definition that precedes this function.
    yaml_file_list = ['dataset.yaml', 'etl.yaml', 'experiment.yaml']
    data, etl, _ = get_yaml_dicts(yaml_file_list)
    val_return = validate_sections(data, train_or_inference)
    output = generate_feature_set_query(ecid_list=val_return[0],
                                        static_features=val_return[1],
                                        temporal_feature_dict=val_return[2],
                                        resort_feature_dict=val_return[3],
                                        subseason_feature_dict=val_return[4],
                                        label_dict=val_return[5],
                                        years_of_data=val_return[6],
                                        time_dict=val_return[7],
                                        custom_feature_dict=val_return[8])
    full_query, feature_names, feature_types = output

    # Read each environment variable once and fail with a KeyError naming the missing
    # variable, instead of a ``str + None`` TypeError or a ``KeyError(None)`` lookup.
    commit_sha = os.environ['CI_COMMIT_SHA']
    schema = etl[os.environ['prod_or_dev']]
    data_lake = etl['data_lake']

    # Stage name is "<database>.<schema>.<stage_name><commit sha>".
    stage_name = '.'.join([etl['snowflake']['database'], schema, data_lake['stage_name']]) + commit_sha
    logger.info('Begining Creation of DataLake Stage')
    sf = snowflake_query(sfSchema=schema)
    make_data_lake_stage(sf_connection=sf,
                         stage_name=stage_name,
                         account=data_lake['account'],
                         container=data_lake['container'],
                         data_lake_path=data_lake['stage_path'].replace('COMMITID', commit_sha),
                         sas_token=os.environ[data_lake['sas_token']])
    logger.info('Begining Query Dump from SF --> Azure Datalake')
    is_training = train_or_inference == 'train'
    blob_path = data_lake['training_path'] if is_training else data_lake['inference_path']
    snowflake_to_data_lake_from_query(sf=sf,
                                      stage_name=stage_name,
                                      blob_path=blob_path,
                                      query=full_query)
    if is_training:
        logger.info('pushing feature list to gitlab artifact to be used in the next section')
        csv_name = 'features.csv'
    else:
        logger.info('pushing inference list to gitlab artifact to be used in the next section')
        csv_name = 'inference_features.csv'
    # Two rows (names, dtypes) transposed into one row per feature with two columns.
    feature_frame = pd.DataFrame([feature_names, feature_types]).T.rename(columns={0: 'featurenames', 1: 'dtypes'})
    feature_frame.to_csv(csv_name, index=False)

In [None]:
from LTBP.data.utils import get_yaml_dicts
from machine_learning_utilities.dataset_creation.snowflake import pull_features_from_snowflake
from data_system_utilities.snowflake.copyinto import sf_to_adls_url_query_generator

  warn_incompatible_dep(


INFO:root:static features in data set: 
 ['DESTINATIONGEOAFINITYLABEL', 'GenderCode']
INFO:root:temporal features in data set: 
 ['Age', 'AvgVisitPerSeason', 'BoughtPass', 'EverCorePass', 'EverPass', 'GuestBehavior', 'IsEpicMixActivated', 'MarketingZone', 'MostCommonTicketComp', 'MostSubSeasonVisited', 'MostVisitedRegion', 'MostVisitedResort', 'OnlySingleResortKey', 'PartnerResortScannerFlag', 'ResortsVisited', 'SkierabilityLabel', 'SubSeasonsPerYear', 'TotalSeasonsScanned', 'TotalVisits', 'VisitMostInPeak']
INFO:root:Appending static feature DESTINATIONGEOAFINITYLABEL to query
INFO:root:Appending static feature GenderCode to query
INFO:root:Finished appending static features
INFO:root:reading training_ecids_18_19.sql for base query...
INFO:root:reading training_ecids_19_20.sql for base query...
INFO:root:reading training_ecids_20_21.sql for base query...
INFO:machine_learning_utilities.dataset_creation.snowflake:final query output: 
 select
base.*
, joined.DESTINATIONGEOAFINITYLABEL
,

In [None]:
#| hide
from pathlib import Path
import yaml


def write_yaml_file(file_path: str, file_name: str, dictionary: dict):
    """Dump ``dictionary`` as YAML into the file ``file_name`` inside ``file_path``.

    The file is opened in write mode, so an existing file with the same name
    is overwritten. The directory itself is not created here.

    Args:
        file_path: directory that will contain the file.
        file_name: name of the YAML file to write.
        dictionary: content to serialize with ``yaml.dump``.
    """
    destination = Path(file_path) / file_name
    with destination.open('w') as stream:
        yaml.dump(dictionary, stream)

In [None]:
# ETL settings for the data-lake export; every value is stored as a string.
etl_dict = dict(
    azure_account='vaildtscadls',
    azure_container='vailadls',
    data_lake_path='projects/LTBP/FY23/',
    exp_name='status_quo/',
    max_file_size='32000000',
    over_write='True',
    query_file_location='sql_files/',
    stage_name='resortvisitation',
)

# Persist the settings as etl.yaml in the package's yaml folder.
write_yaml_file('./LTBP/files/yaml_files/', 'etl.yaml', etl_dict)

In [None]:
from data_system_utilities.snowflake.query import Snowflake
from data_system_utilities.azure.storage import FileHandling
from data_system_utilities.snowflake.copyinto import sf_to_adls_url_query_generator

from LTBP import files
# from LTBP.data.utils import get_yaml_dicts, snowflake_query, query_feature_sets_to_adls_parquet_sf_fs

import os

In [None]:
#| skip
# Export the feature-set query result to the data lake, unless files already exist at
# the target path.
#
# NOTE(review): this cell originally read its settings from ``data_dict``, a name that is
# never defined in this notebook, and did so before any yaml was loaded. The keys it used
# match the etl.yaml written earlier in this notebook, so the settings are now read from
# the loaded ``etl`` dict -- confirm that this is the intended source.
# NOTE(review): ``snowflake_query`` and ``query_feature_sets_to_adls_parquet_sf_fs`` must
# be in scope; their import from ``LTBP.data.utils`` is currently commented out.
logging.info('Loading Yaml Files..')
features, udf_inputs, etl = get_yaml_dicts(['features.yaml', 'udf_inputs.yaml', 'etl.yaml'])

azure_account = etl['azure_account']
azure_container = etl['azure_container']
data_lake_path = etl['data_lake_path']
query_file_path = etl['query_file_location']
data_lake_sas_token = os.environ["DATALAKE_SAS_TOKEN_SECRET"]
partition_by = None
# The next three settings are defined but not passed to the export call below.
max_file_size = "3200000"
header = "True"
over_write = "True"
experiment = True  # this will trigger if the feature set needs to be created
train_or_inference = 'TRAINING'

logging.info('Generating Feature Set Query')
query = pull_features_from_snowflake(feature_dict=features,
                                     udf_inputs=udf_inputs[train_or_inference.upper()],
                                     filepath_to_grain_list_query='./LTBP/files/sql_files/')

# Experiments go under "<data_lake_path>/experiments/<exp_name>"; otherwise the path is
# keyed by the CI commit sha, falling back to 'LocalRunNBS' when it is not set.
data_lake_path = (os.path.join(data_lake_path, 'experiments', etl['exp_name'])
                  if experiment
                  else os.path.join(data_lake_path,
                                    os.environ.get('CI_COMMIT_SHA', 'LocalRunNBS')))
logging.info(f'Checking {data_lake_path} to either skip creation for experiment or create a production dataset')

fh = FileHandling(os.environ['DATALAKE_CONN_STR_SECRET'])
ald_files = fh.ls_blob(path=data_lake_path, container_name=azure_container)

# Only run the export when the listing for the target path comes back empty.
if ald_files == []:
    query_feature_sets_to_adls_parquet_sf_fs(
        sf_connection=snowflake_query(),
        sf_query=query,
        query_file_path=os.path.join(files.__path__[0], query_file_path),
        azure_account=azure_account,
        azure_container=azure_container,
        data_lake_path=data_lake_path,  # TODO: Think about experiments versus
        partition_by=partition_by,
        data_lake_sas_token=data_lake_sas_token,
    )
else:
    logging.warning(f'{data_lake_path} already exists this should be do experimentation runs')

INFO:root:Loading Yaml Files..
INFO:root:Generating Feature Set Query
INFO:root:static features in data set: 
 ['DESTINATIONGEOAFINITYLABEL', 'GenderCode']
INFO:root:temporal features in data set: 
 ['Age', 'AvgVisitPerSeason', 'BoughtPass', 'EverCorePass', 'EverPass', 'GuestBehavior', 'IsEpicMixActivated', 'MarketingZone', 'MostCommonTicketComp', 'MostSubSeasonVisited', 'MostVisitedRegion', 'MostVisitedResort', 'OnlySingleResortKey', 'PartnerResortScannerFlag', 'ResortsVisited', 'SkierabilityLabel', 'SubSeasonsPerYear', 'TotalSeasonsScanned', 'TotalVisits', 'VisitMostInPeak']
INFO:root:Appending static feature DESTINATIONGEOAFINITYLABEL to query
INFO:root:Appending static feature GenderCode to query
INFO:root:Finished appending static features
INFO:root:reading training_ecids_18_19.sql for base query...
INFO:root:reading training_ecids_19_20.sql for base query...
INFO:root:reading training_ecids_20_21.sql for base query...
INFO:machine_learning_utilities.dataset_creation.snowflake:fin

INFO:root:Checking project/LTBP/FY23/experiments/status_quo to either skip creation for experiment or create a production dataset
INFO:data_system_utilities.azure.storage:number of files in container path recursively 0
INFO:data_system_utilities.snowflake.copyinto:
COPY INTO 'azure://vaildtscadls.blob.core.windows.net/vailadls/project/LTBP/FY23/experiments/status_quo'
FROM (select
base.*
, joined.DESTINATIONGEOAFINITYLABEL
, joined.GenderCode
, MACHINELEARNINGFEATURES.PROD.Age_ECID_Temporal(base.ECID, 20171001, 20191005) as Age, MACHINELEARNINGFEATURES.PROD.AvgVisitPerSeason_ECID_Temporal(base.ECID, 20171001, 20191005) as AvgVisitPerSeason, MACHINELEARNINGFEATURES.PROD.BoughtPass_ECID_Temporal(base.ECID, '2019/20') as BoughtPass, MACHINELEARNINGFEATURES.PROD.EverCorePass_ECID_Temporal(base.ECID, 20151005, 20191005) as EverCorePass, MACHINELEARNINGFEATURES.PROD.EverPass_ECID_Temporal(base.ECID, 20151005, 20191005) as EverPass, MACHINELEARNINGFEATURES.PROD.GuestBehavior_ECID_Temporal(bas

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off


KeyboardInterrupt: 