In [None]:
#| hide
from nbdev.showdoc import *

from pathlib import Path
import yaml

from LTBP.data.utils import snowflake_query

sf = snowflake_query()

  warn_incompatible_dep(


# Use Your CLI Commands

> This shows how to run locally, from the command line, everything your GitLab pipeline will do.

## YAML File Creation & Base Query for the Feature Store Call

In [None]:
#| hide
def write_yaml_file(file_path: str, file_name: str, dictionary: dict):
    """Dump `dictionary` as YAML into the file `file_name` inside `file_path`.

    Args:
        file_path: directory that will hold the file; it is not created here,
            so it must already exist.
        file_name: name of the YAML file; an existing file is overwritten.
        dictionary: the object handed to `yaml.dump`.
    """
    destination = Path(file_path) / file_name
    with destination.open('w') as stream:
        yaml.dump(dictionary, stream)


### Create Base Feature Store Call 

In [None]:
#| skip
# Legacy project being moved over: one base query is generated per winter
# season listed here, even though the model itself only uses 3 years of data.
season_year = ['2017/18', '2018/19', '2019/20', '2020/21', '2021/22']
# File-name suffix per season, e.g. '2017/18' -> '17_18', so it is obvious
# which season each generated file covers.
seasons = [f'{label[2:4]}_{label[5:]}' for label in season_year]
for sy, s in zip(season_year, seasons):
    # Distinct non-employee ECIDs scanned in the winter of season `sy`.
    base_query = f"""select
distinct fs.ECID as ecid,
dd.SeasonYear as SeasonYear
from BIDE_EDWDB_ARA_PROD.dbo.FactScan fs
left join BIDE_EDWDB_ARA_PROD.dbo.DimDateSeason dd
on dd.DateSeasonKey = fs.DateSeasonKey
where
dd.SeasonYear in ('{sy}')
and fs.IsEmployee = 0
and dd.Season = 'Winter'
LIMIT 10000
"""
    Path('./LTBP/files/sql_files/', f'training_ecids_{s}.sql').write_text(base_query)

### Model Features Yaml/Dictionary

This is described in depth inside of the Data_Preparation_Example.ipynb 

In [None]:
#| skip
# Experiments every feature below takes part in.
_EXPERIMENTS = ('BASELINE', 'NOHYPEROPT')


def _feature_spec(udf_name, var_type, transformation, *, feature_type='TEMP',
                  input_type='3_YEAR_DATE_RANGE', input_definition='FEATURE'):
    """Build one features.yaml entry whose settings are the same in every experiment.

    Args:
        udf_name: name of the feature-store UDF (or static column) to call.
        var_type: 'cat' or 'cont'; stored under every experiment name.
        transformation: preprocessing step name (or None); stored under every
            experiment name.
        feature_type: 'TEMP' (default) or 'STATIC'.
        input_type: key into udf_inputs that supplies the UDF arguments, or
            None for static features.
        input_definition: 'FEATURE' (default) or 'LABEL'.

    Returns:
        A new dict. The nested dicts and the list are created on every call so
        that entries never share objects (PyYAML serializes shared containers
        as anchors/aliases, which would change the generated file).
    """
    return {
        'feature_type': feature_type,
        'input_type': input_type,
        'input_definition': input_definition,
        'udf_name': udf_name,
        'var_type': {name: var_type for name in _EXPERIMENTS},
        'transformation': {name: transformation for name in _EXPERIMENTS},
        'experiment_list': list(_EXPERIMENTS),
    }


# NOTE: the previous literal defined 'MarketingZone', 'EverCorePass' and
# 'EverPass' twice. Python keeps only the LAST value for a repeated key, so the
# earlier definitions were silently dropped; only the effective ones are kept.
feature_dict = {
    # --- static features -------------------------------------------------
    'GenderCode': _feature_spec(
        'GenderCode', 'cat', 'OrdinalEncoder', feature_type='STATIC', input_type=None),
    'DESTINATIONGEOAFINITYLABEL': _feature_spec(
        'DESTINATIONGEOAFINITYLABEL', 'cat', 'OrdinalEncoder', feature_type='STATIC', input_type=None),

    # --- temporal features -----------------------------------------------
    # Issue with temporal element eventual switch back
    # glv issue spotted and snowpark fixes it
    # (the dropped duplicate was the STATIC variant: udf_name 'MarketingZone',
    #  input_type None)
    'MarketingZone': _feature_spec(
        'MarketingZone_ECID_Temporal', 'cat', 'OrdinalEncoder', input_type='SEASONYEAR'),
    'Age': _feature_spec('Age_ECID_Temporal', 'cont', 'StandardScaler'),
    # The dropped duplicates of the next two used '3_YEAR_DATE_RANGE'.
    'EverCorePass': _feature_spec(
        'EverCorePass_ECID_Temporal', 'cat', 'OrdinalEncoder', input_type='ORIGIN_YEAR_DATE_RANGE'),
    'EverPass': _feature_spec(
        'EverPass_ECID_Temporal', 'cat', 'OrdinalEncoder', input_type='ORIGIN_YEAR_DATE_RANGE'),
    'OnlySingleResortKey': _feature_spec(
        'OnlySingleResortKey_ECID_Temporal', 'cat', 'OrdinalEncoder'),
    'PartnerResortScannerFlag': _feature_spec(
        'PartnerResortScannerFlag_ECID_Temporal', 'cat', 'OrdinalEncoder'),
    'TotalSeasonsScanned': _feature_spec(
        'TotalSeasonsScanned_ECID_Temporal', 'cat', 'OrdinalEncoder'),
    # NOTE(review): the only 'cont' feature paired with OrdinalEncoder instead
    # of StandardScaler; kept as-is, confirm that this is intentional.
    'ResortsVisited': _feature_spec(
        'ResortsVisited_ECID_Temporal', 'cont', 'OrdinalEncoder'),
    'SubSeasonsPerYear': _feature_spec(
        'SubSeasonsPerYear_ECID_Temporal', 'cont', 'StandardScaler'),
    'MostCommonTicketComp': _feature_spec(
        'MostCommonTicketComp_ECID_Temporal', 'cat', 'OrdinalEncoder'),
    'AvgVisitPerSeason': _feature_spec(
        'AvgVisitPerSeason_ECID_Temporal', 'cont', 'StandardScaler'),
    'MostSubSeasonVisited': _feature_spec(
        'MostSubSeasonVisited_ECID_Temporal', 'cat', 'OrdinalEncoder'),
    'TotalVisits': _feature_spec(
        'TotalVisits_ECID_Temporal', 'cont', 'StandardScaler'),
    'VisitMostInPeak': _feature_spec(
        'VisitMostInPeak_ECID_Temporal', 'cat', 'OrdinalEncoder'),
    'MostVisitedRegion': _feature_spec(
        'MostVisitedRegion_ECID_Temporal', 'cat', 'OrdinalEncoder'),
    'MostVisitedResort': _feature_spec(
        'MostVisitedResort_ECID_Temporal', 'cat', 'OrdinalEncoder'),
    'IsEpicMixActivated': _feature_spec(
        'IsEpicMixActivated_ECID_Temporal', 'cat', 'OrdinalEncoder', input_type='SEASONYEAR'),
    'SkierabilityLabel': _feature_spec(
        'SkierabilityLabel_ECID_Temporal', 'cat', 'OrdinalEncoder', input_type='SEASONYEAR'),
    'GuestBehavior': _feature_spec(
        'GuestBehavior_ECID_Temporal', 'cat', 'OrdinalEncoder', input_type='SEASONYEAR'),

    # --- label -------------------------------------------------------------
    # No transformation for the label. NOHYPEROPT previously held the string
    # 'None' instead of None, which is not a null once written to YAML.
    'BoughtPass': _feature_spec(
        'BoughtPass_ECID_Temporal', 'cat', None,
        input_type='SEASONYEAR', input_definition='LABEL'),
}

write_yaml_file('./LTBP/files/yaml_files/', 'features.yaml', feature_dict)

### UDF Inputs Yaml/Dictionary

This is described in depth inside of the Data_Preparation_Example.ipynb 

In [None]:
#| skip
def _by_index(*values):
    """Key positional values by their 0-based position: (a, b) -> {0: a, 1: b}."""
    return dict(enumerate(values))


def _for_each_experiment(build):
    """Return {'BASELINE': build(), 'NOHYPEROPT': build()}.

    `build` is called once per experiment so the two sections hold equal but
    separate objects, exactly like the hand-written literal did (shared
    containers would be written by PyYAML as anchors/aliases).
    """
    return {'BASELINE': build(), 'NOHYPEROPT': build()}


def _training_feature_inputs():
    """UDF arguments for the three training data sets, keyed by input type."""
    return {
        'SEASON_END': _by_index('20181005', '20191005', '20201005'),
        '3_YEAR_DATE_RANGE': _by_index(
            ['20171001', '20191005'],
            ['20181001', '20201005'],
            ['20191001', '20211005'],
        ),
        'SEASONYEAR': _by_index("'2017/18'", "'2018/19'", "'2019/20'"),
        'ORIGIN_YEAR_DATE_RANGE': _by_index(
            ['20151005', '20191005'],
            ['20151005', '20201005'],
            ['20151005', '20211005'],
        ),
    }


def _inference_feature_inputs():
    """UDF arguments for the single inference data set, keyed by input type."""
    return {
        'SEASON_END': _by_index('20221005'),
        '3_YEAR_DATE_RANGE': _by_index(['20190101', '20221005']),
        'SEASONYEAR': _by_index("'2021/22'"),
        'ORIGIN_YEAR_DATE_RANGE': _by_index(['20051001', '20221005']),
    }


udf_inputs = {
    'TRAINING': {
        # TODO: PR to remove this duplication
        'UDF_GRAIN': ['base.ECID'],  # input key to all udfs
        'FEATURE': _for_each_experiment(_training_feature_inputs),
        'LABEL': _for_each_experiment(
            lambda: {'SEASONYEAR': _by_index("'2019/20'", "'2020/21'", "'2021/22'")}
        ),
        'BASE_QUERY': _for_each_experiment(
            lambda: _by_index(
                'training_ecids_18_19.sql',
                'training_ecids_19_20.sql',
                'training_ecids_20_21.sql',
            )
        ),
        'ADDITIONAL_COLUMNS': {
            'SEASONYEAR': _by_index("'2018/19'", "'2019/20'", "'2020/21'"),
        },
    },
    'INFERENCE': {
        'UDF_GRAIN': ['base.ECID'],  # input key to all udfs
        'FEATURE': _for_each_experiment(_inference_feature_inputs),
        'LABEL': _for_each_experiment(
            lambda: {'SEASONYEAR': _by_index("'2021/22'")}
        ),
        'BASE_QUERY': _for_each_experiment(
            lambda: _by_index('inference_base.sql')
        ),
        'ADDITIONAL_COLUMNS': {
            'SEASONYEAR': _by_index("'2021/22'"),
        },
    },
}

write_yaml_file('./LTBP/files/yaml_files/', 'udf_inputs.yaml', udf_inputs)

### ETL Yaml/Dictionary

In [None]:

#| skip
# ETL settings written to etl.yaml. Like the other cells that write project
# files, this cell carries the skip directive so that it is not executed on
# every notebook run.
etl = {
    # Azure storage account / container and the project folder inside it.
    'azure_account': 'vaildtscadls',
    'azure_container': 'vailadls',
    'data_lake_path': 'projects/LTBP/FY23/',
    # NOTE(review): numeric and boolean settings are stored as strings here;
    # presumably the consumer parses them - confirm before changing the types.
    'max_file_size': '32000000',
    'over_write': 'True',
    'query_file_path': 'sql_files/',
    'stage_name': 'ltbp',
    'FY_folder': 'FY23',
    # No extra statement for either run type.
    'extra_statement': {
        'TRAINING': None,
        'INFERENCE': None,
    },
}

write_yaml_file('./LTBP/files/yaml_files/', 'etl.yaml', etl)

### Model Yaml/Dictionary

**Global Use**

- preprocessors_adls_path
    
    - value: 'preprocessors/'
    
    - purpose: the ADLS folder your preprocessors are sent to; a sub-folder named after the experiment's model_trainer is appended (e.g. preprocessors/train_xgb/)
    
- modeling_adls_path
    
    - value: 'modeling/'
    
    - purpose: the ADLS folder your models are sent to; a sub-folder named after the experiment's model_trainer is appended (e.g. modeling/train_xgb/)
    
- predictions_adls_path
    
    - value: 'predictions/'
    
    - purpose: the ADLS folder your model prediction file is sent to; a sub-folder named after the experiment's model_trainer is appended (e.g. predictions/train_xgb/)
    
- connection_str
    
    - value: 'DATALAKE_CONN_STR_SECRET'
    
    - purpose: n/a
    
- sas_token
    
    - value: 'DATALAKE_SAS_TOKEN_SECRET'
    
    - purpose: n/a
    
- hold_out_table
    
    - value: 'LTBP_HOLDOUT_TEST_MODEL_RESULTS'
    
    - purpose: this example uses a test set; if you don't have one, the validation set is sent to this table instead, together with the identification columns, by your custom `send_holdout_results_to_sf`
    
- tracking_table
    
    - value: 'LTBP_MODEL_TRACKING_FY23'
    
    - purpose: the name of the project log table to which msu will send the information for this run.
    
- identification
    
    - value: ['ECID', 'SEASONYEAR']
    
    - purpose: these columns are carried through to your hold out set and your inference table so that each row can be matched to the correct label.
    
- inference_sf_table_name
    
    - value: 'LTBP_PREDICTIONS_FY23'
    
    - purpose: the inference table name that will be created

**Inside Experiment**

For now, every experiment is expected to define the hyperopt options even if its model does not use them; in this example, the options that are not relevant to a model are left empty (`None` or `''`). This process will be improved upon as we iterate.

- model_trainer: is the function inside of models.py that is going to be used for inference

In [None]:
#| skip
models_dict = dict({
    'preprocessors_adls_path' : 'preprocessors/',
    'modeling_adls_path' : 'modeling/',
    'predictions_adls_path': 'predictions/',
    'connection_str': 'DATALAKE_CONN_STR_SECRET',
    'sas_token' : 'DATALAKE_SAS_TOKEN_SECRET',
    'hold_out_table' : 'LTBP_HOLDOUT_TEST_MODEL_RESULTS',
    'tracking_table' : 'LTBP_MODEL_TRACKING_FY23',
    'identification': ['ECID', 'SEASONYEAR'],
    'inference_sf_table_name': 'LTBP_PREDICTIONS_FY23',
    'BASELINE': {
        'description': 'Standard baseline xgb_hyperopt approach status quo of LTBP of the past',
        'model_trainer': 'train_xgb',
        'y_preprocess_object_name': None,
        'y_scaler_type' : None,
        'x_preprocess_object_name': 'standard_pipe.pickle',
        'hyperopt_evals' : 2,
        'hyper_opt_subsample_size': 2750000,
        'training_subsample_size' : 5000000,

    },
    'NOHYPEROPT': {
        'description': 'Only here to see if it works delete at some point xgb_fit_only',
        'model_trainer': 'train_xgb_basic',
        'y_preprocess_object_name': None,
        'y_scaler_type' : None,
        'x_preprocess_object_name': 'standard_pipe.pickle',
        'hyper_opt_subsample_size' : '',
        'hyperopt_evals' : '',
        'training_subsample_size' : 5000000,
    }
})

write_yaml_file('./LTBP/files/yaml_files/', 'models.yaml', models_dict)

### Result

The project YAML files are created and written to the location below

In [None]:
#| skip
! ls ./LTBP/files/yaml_files/

etl.yaml        features.yaml   models.yaml     udf_inputs.yaml


In [None]:
#| skip
! ls ./LTBP/files/sql_files/

inference_base.sql       training_ecids_18_19.sql training_ecids_20_21.sql
training_ecids_17_18.sql training_ecids_19_20.sql training_ecids_21_22.sql


## Data Set Creation

Let's grab a training set for the *BASELINE* and *NOHYPEROPT* experiments. This is where you might want to build functionality that only creates a new data source when the data has changed; that was more work than was taken on here, but it is very much a possibility.

In [None]:
#| skip
! data_creation  --train_or_inference "TRAINING" --experiment_name "BASELINE" --experiment 

  warn_incompatible_dep(
INFO:root:This is a experiment run
INFO:root:Loading Yaml Files..
INFO:root:Generating Feature Set Query
INFO:root:static features in data set: 
 ['DESTINATIONGEOAFINITYLABEL', 'GenderCode']
INFO:root:temporal features in data set: 
 ['Age', 'AvgVisitPerSeason', 'BoughtPass', 'EverCorePass', 'EverPass', 'GuestBehavior', 'IsEpicMixActivated', 'MarketingZone', 'MostCommonTicketComp', 'MostSubSeasonVisited', 'MostVisitedRegion', 'MostVisitedResort', 'OnlySingleResortKey', 'PartnerResortScannerFlag', 'ResortsVisited', 'SkierabilityLabel', 'SubSeasonsPerYear', 'TotalSeasonsScanned', 'TotalVisits', 'VisitMostInPeak']
INFO:root:Appending static feature DESTINATIONGEOAFINITYLABEL to query
INFO:root:Appending static feature GenderCode to query
INFO:root:Finished appending static features
INFO:root:reading training_ecids_18_19.sql for base query...
INFO:root:reading training_ecids_19_20.sql for base query...
INFO:root:reading training_ecids_20_21.sql for base query...
IN

INFO:data_system_utilities.azure.storage:number of files in container path recursively 0
INFO:data_system_utilities.snowflake.copyinto:
COPY INTO 'azure://vaildtscadls.blob.core.windows.net/vailadls/projects/LTBP/FY23/experiments/NOHYPEROPT/training_data/'
FROM (select
base.*
, joined.DESTINATIONGEOAFINITYLABEL
, joined.GenderCode
, MACHINELEARNINGFEATURES.PROD.Age_ECID_Temporal(base.ECID, 20171001, 20191005) as Age, MACHINELEARNINGFEATURES.PROD.AvgVisitPerSeason_ECID_Temporal(base.ECID, 20171001, 20191005) as AvgVisitPerSeason, MACHINELEARNINGFEATURES.PROD.BoughtPass_ECID_Temporal(base.ECID, '2019/20') as BoughtPass, MACHINELEARNINGFEATURES.PROD.EverCorePass_ECID_Temporal(base.ECID, 20151005, 20191005) as EverCorePass, MACHINELEARNINGFEATURES.PROD.EverPass_ECID_Temporal(base.ECID, 20151005, 20191005) as EverPass, MACHINELEARNINGFEATURES.PROD.GuestBehavior_ECID_Temporal(base.ECID, '2017/18') as GuestBehavior, MACHINELEARNINGFEATURES.PROD.IsEpicMixActivated_ECID_Temporal(base.ECI

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:root:data has been delivered from sf to adls


In [None]:
#| skip
! data_creation  --train_or_inference "TRAINING" --experiment_name "NOHYPEROPT" --experiment 

Let's now grab an **inference/test set** and place it into adls for the batch inference process

In [None]:
#| skip
! data_creation  --train_or_inference 'INFERENCE' --experiment_name 'BASELINE' --experiment 

  warn_incompatible_dep(
INFO:root:This is a experiment run
INFO:root:Loading Yaml Files..
INFO:root:Generating Feature Set Query
INFO:root:static features in data set: 
 ['DESTINATIONGEOAFINITYLABEL', 'GenderCode']
INFO:root:temporal features in data set: 
 ['Age', 'AvgVisitPerSeason', 'BoughtPass', 'EverCorePass', 'EverPass', 'GuestBehavior', 'IsEpicMixActivated', 'MarketingZone', 'MostCommonTicketComp', 'MostSubSeasonVisited', 'MostVisitedRegion', 'MostVisitedResort', 'OnlySingleResortKey', 'PartnerResortScannerFlag', 'ResortsVisited', 'SkierabilityLabel', 'SubSeasonsPerYear', 'TotalSeasonsScanned', 'TotalVisits', 'VisitMostInPeak']
INFO:root:Appending static feature DESTINATIONGEOAFINITYLABEL to query
INFO:root:Appending static feature GenderCode to query
INFO:root:Finished appending static features
INFO:root:reading inference_base.sql for base query...
INFO:root:final query output: 
 select
base.*
, joined.DESTINATIONGEOAFINITYLABEL
, joined.GenderCode
, MACHINELEARNINGFEATURES.PR

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:root:data has been delivered from sf to adls


In [None]:
#| skip
! data_creation  --train_or_inference 'INFERENCE' --experiment_name 'NOHYPEROPT' --experiment 

### Results

Inside the adls subscription specified in the yaml files, these first two steps create the data for your experiment or model run:

- projects/LTBP/FY23/experiments/BASELINE/training_data/

- projects/LTBP/FY23/experiments/BASELINE/inference_data/

> Note: experiments can be changed with CI_COMMIT_SHA depending on the approach that is being run

##  Train Model

In [None]:
#| skip
! model_train --experiment_name 'BASELINE' --sfSchema 'dev' --test_set --experiment 

  warn_incompatible_dep(
INFO:data_system_utilities.snowflake.utils:stage_query: 
 create or replace stage ltbpFY23LocalRunTest
url='azure://vaildtscadls.blob.core.windows.net/vailadls/projects/LTBP/FY23/experiments/BASELINE'
credentials=(azure_sas_token='**MASKED**')
encryption=(type= 'NONE')
file_format = (type = parquet        )
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:data_system_utilities.snowflake.query:Stage area LTBPFY23LOCALRUNTEST successfully created.
INFO:root:adls snowflake stage query 
    select
    $1:"ECID"::varchar as ECID
, $1:"SEASONYEAR"::varchar as SEASONYEAR
, $1:"AGE"::varchar as AGE
, $1:"AVGVISITPERSEASON"::varchar as AVGVISITPERSEASON
, $1:"BOUGHTPASS"::varchar as BOUGHTPASS
, $1:"DESTINATIONGEOAFINITY

INFO:machine_learning_utilities.preprocessing:Preprocessing Pipeline Object:
Pipeline(steps=[('preprocessing',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('functiontransformer',
                                                                  FunctionTransformer(func=<function get_cat_cols>,
                                                                                      kw_args={'cols': ['DESTINATIONGEOAFINITYLABEL']})),
                                                                 ('ordinalencoder',
                                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                                 unknown_value=-1))])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('...
                                            

INFO:data_system_utilities.azure.storage:Azure Upload Complete
INFO:root:train_xgbLocalRunTestBASELINE.pkl successfully pushed to projects/LTBP/FY23/experiments/BASELINE/modeling/train_xgb


In [None]:
#| skip
! model_train --experiment_name 'NOHYPEROPT' --sfSchema 'dev' --test_set --experiment 

  warn_incompatible_dep(
INFO:data_system_utilities.snowflake.utils:stage_query: 
 create or replace stage ltbpFY23LocalRunTest
url='azure://vaildtscadls.blob.core.windows.net/vailadls/projects/LTBP/FY23/experiments/NOHYPEROPT'
credentials=(azure_sas_token='**MASKED**')
encryption=(type= 'NONE')
file_format = (type = parquet        )
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:data_system_utilities.snowflake.query:Stage area LTBPFY23LOCALRUNTEST successfully created.
INFO:root:adls snowflake stage query 
    select
    $1:"ECID"::varchar as ECID
, $1:"SEASONYEAR"::varchar as SEASONYEAR
, $1:"AGE"::varchar as AGE
, $1:"AVGVISITPERSEASON"::varchar as AVGVISITPERSEASON
, $1:"BOUGHTPASS"::varchar as BOUGHTPASS
, $1:"DESTINATIONGEOAFINI

INFO:machine_learning_utilities.preprocessing:Preprocessing Pipeline Object:
Pipeline(steps=[('preprocessing',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('functiontransformer',
                                                                  FunctionTransformer(func=<function get_cat_cols>,
                                                                                      kw_args={'cols': ['DESTINATIONGEOAFINITYLABEL']})),
                                                                 ('ordinalencoder',
                                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                                 unknown_value=-1))])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('...
                                            

INFO:data_system_utilities.snowflake.query:table created
INFO:root:project log preview:
                                  ACTION_DESCRIPTION  ... EVER_PRODUCTION
0  Only here to see if it works delete at some po...  ...           False

[1 rows x 12 columns]
INFO:root:project log values preview:
['Only here to see if it works delete at some point xgb_fit_only'
 'model_training' 'LocalRunTest' 'dev' 'random' '2022-11-03 17:54:17'
 '{"azure_parent_folder": "projects/LTBP/FY23/experiments/NOHYPEROPT"}'
 '{"training_metrics": {"auc": 0.9233503051910592, "acc": 0.8459166666666667, "bacc": 0.8470889725812882}, "fi_train": {"EVERPASS": 0.27579999999999993, "MOSTCOMMONTICKETCOMP": 0.03520000000000001, "SUBSEASONSPERYEAR": 0.02079999999999993, "TOTALVISITS": 0.014599999999999946, "GUESTBEHAVIOR": 0.014399999999999968, "AGE": 0.0129999999999999, "DESTINATIONGEOAFINITYLABEL": 0.010000000000000009, "ISEPICMIXACTIVATED": 0.006799999999999917, "AVGVISITPERSEASON": 0.006399999999999961, "MOSTSUBSEASO

### Results

**ADLS** 

Created in this call are as follows:

1. x and y preprocessor

    - projects/LTBP/FY23/experiments/BASELINE/preprocessors/train_xgb/

2. Sklearn Pipeline Model

    - projects/LTBP/FY23/experiments/BASELINE/modeling/train_xgb/

3. Hold out results for this model in this experiment

    - projects/LTBP/FY23/experiments/BASELINE/holdout_results/train_xgb/

**Snowflake**

A project log entry is written for the current run. This is a new standard approach: promoting a model requires a quick PR before the model can be put into production. Similar to what the legacy project currently supports, the log has something like PROD_MODEL as a BOOL type; when it is True, the scheduled run of this repo uses it to find the production model in ADLS. We might also want something like EVER_PROD_MODEL so a user can tell which models in the project log made it to production.

> As of Nov 2nd this template doesn't support the inference only runs, but it will soon

In [None]:
#| skip
# Most recent entry in the FY23 model tracking log (newest TIMESTAMP first, one row).
latest_tracking_sql = '''SELECT * 
FROM MACHINELEARNINGOUTPUTS.LTBP.LTBP_MODEL_TRACKING_FY23
ORDER BY TIMESTAMP DESC
LIMIT 1
'''
sf.run_sql_str(latest_tracking_sql)

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off


Unnamed: 0,ACTION_DESCRIPTION,TRANSACTION_TYPE,COMMITID,ENVIRONMENT,BRANCH,TIMESTAMP,ARTIFACTS,METRICS,EXPERIMENT_NAME,EXPERIMENT,PRODUCTION_MODEL,EVER_PRODUCTION
0,Only here to see if it works delete at some po...,model_training,LocalRunTest,dev,random,2022-11-03 17:54:17,"{""azure_parent_folder"": ""projects/LTBP/FY23/ex...","{""training_metrics"": {""auc"": 0.923350305191059...",NOHYPEROPT,True,False,False


A hold-out test set is also sent to Snowflake for post-analysis work, in case a data scientist wants to look at where the model is doing well or, in LTBP's case for example, how well the model extrapolates across season years.

In [None]:
#| skip
# Peek at five rows of the hold-out results table in the DEV schema.
holdout_preview_sql = '''SELECT * 
FROM MACHINELEARNINGOUTPUTS.DEV.LTBP_HOLDOUT_TEST_MODEL_RESULTS
LIMIT 5
'''
sf.run_sql_str(holdout_preview_sql)

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off


Unnamed: 0,ECID,SEASONYEAR,PROBABILITY,DATECREATED,EXP_COMMIT_CI_SHA
0,25115976,2019/20,0.386076,2022-11-03 11:11:13,BASELINE_LocalRunTest
1,5198697,2018/19,0.392566,2022-11-03 11:11:13,BASELINE_LocalRunTest
2,86694567,2018/19,0.385913,2022-11-03 11:11:13,BASELINE_LocalRunTest
3,17645838,2018/19,0.395433,2022-11-03 11:11:13,BASELINE_LocalRunTest
4,11271320,2018/19,0.39433,2022-11-03 11:11:13,BASELINE_LocalRunTest


## Infer Model Results

In [None]:
#| skip
# Run the `model_inference` CLI for the BASELINE experiment with `--sfSchema 'dev'`.
# NOTE(review): `--experiment` is passed bare, presumably a boolean flag marking this
# as an experiment (non-production) run; confirm against the CLI definition.
! model_inference --sfSchema 'dev' --experiment_name 'BASELINE' --experiment 

  warn_incompatible_dep(
INFO:data_system_utilities.snowflake.utils:stage_query: 
 create or replace stage ltbpFY23LocalRunTest
url='azure://vaildtscadls.blob.core.windows.net/vailadls/projects/LTBP/FY23/experiments/BASELINE'
credentials=(azure_sas_token='**MASKED**')
encryption=(type= 'NONE')
file_format = (type = parquet        )
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:data_system_utilities.snowflake.query:Stage area LTBPFY23LOCALRUNTEST successfully created.
INFO:root:adls snowflake stage query 
    select
    $1:"ECID"::varchar as ECID
, $1:"SEASONYEAR"::varchar as SEASONYEAR
, $1:"AGE"::varchar as AGE
, $1:"AVGVISITPERSEASON"::varchar as AVGVISITPERSEASON
, $1:"BOUGHTPASS"::varchar as BOUGHTPASS
, $1:"DESTINATIONGEOAFINITY

In [None]:
#| skip
# Same `model_inference` call as the BASELINE cell, this time for the NOHYPEROPT experiment.
# NOTE(review): `--experiment` is passed bare, presumably a boolean flag marking this
# as an experiment (non-production) run; confirm against the CLI definition.
! model_inference --sfSchema 'dev' --experiment_name 'NOHYPEROPT' --experiment 

  warn_incompatible_dep(
INFO:data_system_utilities.snowflake.utils:stage_query: 
 create or replace stage ltbpFY23LocalRunTest
url='azure://vaildtscadls.blob.core.windows.net/vailadls/projects/LTBP/FY23/experiments/NOHYPEROPT'
credentials=(azure_sas_token='**MASKED**')
encryption=(type= 'NONE')
file_format = (type = parquet        )
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:data_system_utilities.snowflake.query:Stage area LTBPFY23LOCALRUNTEST successfully created.
INFO:root:adls snowflake stage query 
    select
    $1:"ECID"::varchar as ECID
, $1:"SEASONYEAR"::varchar as SEASONYEAR
, $1:"AGE"::varchar as AGE
, $1:"AVGVISITPERSEASON"::varchar as AVGVISITPERSEASON
, $1:"BOUGHTPASS"::varchar as BOUGHTPASS
, $1:"DESTINATIONGEOAFINI

### Results

**ADLS**

A .csv file is sent to the following location. It is a .csv because it isn't huge, it is only sent to Snowflake once, and it's sometimes nice to be able to pull the file down and look into it, which parquet files don't allow you to do. We are completely open to changing this process, though.

- projects/LTBP/FY23/experiments/BASELINE/predictions/train_xgb/

**Snowflake**

An inference table is set up, and at the moment we are appending to it. This could change shortly, but for right now each run in your inference table can be uniquely identified by its DATE_CREATED + EXPERIMENT. Understanding what is desired is part of iterating on this quick process, and there can be value in having the last run's predictions in the database. 

Looking for input on this


> Note: For the production run version creating an archive of all the runs that occur for a production level run seems like a pretty good way to save ourselves trouble, but again all the predictions will always be recoverable inside of adls.

In [None]:
#| skip
# Five most recently created rows of the DEV predictions table.
dev_predictions_sql = '''SELECT * 
FROM MACHINELEARNINGOUTPUTS.DEV.LTBP_PREDICTIONS_FY23
ORDER BY DATE_CREATED DESC
LIMIT 5
'''
sf.run_sql_str(dev_predictions_sql)

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off


Unnamed: 0,ECID,SEASONYEAR,PROBABILITY,CI_COMMIT_SHA,DATE_CREATED,EXPERIMENT
0,7505200,2021/22,0.049381,LocalRunTest,2022-11-03 16:55:02,NOHYPEROPT
1,57254352,2021/22,0.020158,LocalRunTest,2022-11-03 16:55:02,NOHYPEROPT
2,3396514,2021/22,0.032057,LocalRunTest,2022-11-03 16:55:02,NOHYPEROPT
3,37310378,2021/22,0.018215,LocalRunTest,2022-11-03 16:55:02,NOHYPEROPT
4,60713843,2021/22,0.096196,LocalRunTest,2022-11-03 16:55:02,NOHYPEROPT


In [None]:
#| skip
# List the distinct (DATE_CREATED, EXPERIMENT) pairs, i.e. one row per inference run
# appended to the DEV predictions table.
# ORDER BY makes the LIMIT deterministic and returns the newest runs first, matching
# the equivalent query against the LTBP (production) predictions table further down.
sf.run_sql_str('''SELECT DISTINCT DATE_CREATED, EXPERIMENT
FROM MACHINELEARNINGOUTPUTS.DEV.LTBP_PREDICTIONS_FY23
ORDER BY DATE_CREATED DESC
LIMIT 5
''')

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off


Unnamed: 0,DATE_CREATED,EXPERIMENT
0,2022-11-03 16:54:50,BASELINE
1,2022-11-03 16:55:02,NOHYPEROPT


## Model Promotion

Let's say this experiment produced the model you are happy with and you don't want to run it in a GitLab/GitHub pipeline to get the results. You can always promote a model to production, because scheduled inference runs will only pull inference data and run the model inference and the post analysis that is custom to your project.


**Why?**

So the question you might be asking is: why wouldn't I just promote the model that I just ran to prod, or make a copy of it in a "production schema"? That could happen, but we like to be modular in the approach we take, and you need to be very intentional about overwriting the results in prod.

The long story short is that it forces you to think about the promotion of a model before letting it happen. It is manual, but it is manual on purpose, to slow you down.


**How**

In [None]:
#| skip
# Dump the whole FY23 model tracking log to see every run recorded so far.
tracking_log_sql = '''SELECT * 
FROM MACHINELEARNINGOUTPUTS.LTBP.LTBP_MODEL_TRACKING_FY23
'''
sf.run_sql_str(tracking_log_sql)

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off


Unnamed: 0,ACTION_DESCRIPTION,TRANSACTION_TYPE,COMMITID,ENVIRONMENT,BRANCH,TIMESTAMP,ARTIFACTS,METRICS,EXPERIMENT_NAME,EXPERIMENT,PRODUCTION_MODEL,EVER_PRODUCTION
0,Standard baseline xgb_hyperopt approach status...,model_training,LocalRunTest,dev,random,2022-11-03 17:53:58,"{""azure_parent_folder"": ""projects/LTBP/FY23/ex...","{""training_metrics"": {""auc"": 0.901687451479167...",BASELINE,True,False,False
1,Only here to see if it works delete at some po...,model_training,LocalRunTest,dev,random,2022-11-03 17:54:17,"{""azure_parent_folder"": ""projects/LTBP/FY23/ex...","{""training_metrics"": {""auc"": 0.923350305191059...",NOHYPEROPT,True,False,False


If this is your first time then you should not have any production models created for this project already

In [None]:
#| skip
# Only the tracked runs whose PRODUCTION_MODEL flag is set.
production_models_sql = '''SELECT * 
FROM MACHINELEARNINGOUTPUTS.LTBP.LTBP_MODEL_TRACKING_FY23
WHERE PRODUCTION_MODEL
'''
sf.run_sql_str(production_models_sql)

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off


Unnamed: 0,ACTION_DESCRIPTION,TRANSACTION_TYPE,COMMITID,ENVIRONMENT,BRANCH,TIMESTAMP,ARTIFACTS,METRICS,EXPERIMENT_NAME,EXPERIMENT,PRODUCTION_MODEL,EVER_PRODUCTION


So to promote this model we will need to update it and an easy way to do this is with the UPDATE command

There are of course other ways to do this. In this type of modeling approach you can come up with a custom post-analysis work stream that promotes the best model based on whatever metrics you want, but this is the simplest way to do it.

Here we are pushing two models to production to make sure the process works for two models. You may want this ability for an ensemble, or for a model that works well for a specific category of guest, so that you can build a view on top of this table for the business to query.

In [None]:
#| skip
# (EXPERIMENT_NAME, TIMESTAMP) of each tracked run to flag as a production model.
promotions = [
    ('BASELINE', '2022-11-03 17:53:58'),
    ('NOHYPEROPT', '2022-11-03 17:54:17'),
]

# One UPDATE per run; each row is pinned by commit id, experiment name,
# experiment flag and the exact training timestamp.
for experiment_name, trained_at in promotions:
    sf.run_sql_str(f'''UPDATE MACHINELEARNINGOUTPUTS.LTBP.LTBP_MODEL_TRACKING_FY23
SET PRODUCTION_MODEL = True
WHERE COMMITID = 'LocalRunTest'
AND EXPERIMENT_NAME = '{experiment_name}'
AND EXPERIMENT = 'True'
AND CAST(TIMESTAMP as VARCHAR) = '{trained_at}'
''')

# Show every run that is now flagged as a production model.
sf.run_sql_str('''SELECT * 
FROM MACHINELEARNINGOUTPUTS.LTBP.LTBP_MODEL_TRACKING_FY23
WHERE PRODUCTION_MODEL
''')

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off


Unnamed: 0,ACTION_DESCRIPTION,TRANSACTION_TYPE,COMMITID,ENVIRONMENT,BRANCH,TIMESTAMP,ARTIFACTS,METRICS,EXPERIMENT_NAME,EXPERIMENT,PRODUCTION_MODEL,EVER_PRODUCTION
0,Standard baseline xgb_hyperopt approach status...,model_training,LocalRunTest,dev,random,2022-11-03 17:53:58,"{""azure_parent_folder"": ""projects/LTBP/FY23/ex...","{""training_metrics"": {""auc"": 0.901687451479167...",BASELINE,True,True,False
1,Only here to see if it works delete at some po...,model_training,LocalRunTest,dev,random,2022-11-03 17:54:17,"{""azure_parent_folder"": ""projects/LTBP/FY23/ex...","{""training_metrics"": {""auc"": 0.923350305191059...",NOHYPEROPT,True,True,False


**Important**

Never forget to mark your previous production model as no longer in production (`PRODUCTION_MODEL = False`) while recording it as a past production model (`EVER_PRODUCTION = True`)

In [None]:
#| skip
# Demote a run: clear PRODUCTION_MODEL and record that it was once in production.
# The timestamp in the WHERE clause is illustrative only (see the inline SQL comment).
demote_sql = '''UPDATE MACHINELEARNINGOUTPUTS.LTBP.LTBP_MODEL_TRACKING_FY23
SET PRODUCTION_MODEL = False, EVER_PRODUCTION = True
WHERE COMMITID = 'LocalRunTest'
AND EXPERIMENT_NAME = 'BASELINE'
AND EXPERIMENT = 'True'
AND CAST(TIMESTAMP as VARCHAR) = '2022-11-03 12:11:18' --Here to show what it might look like
'''
sf.run_sql_str(demote_sql)

# List every run that has its EVER_PRODUCTION flag set.
ever_production_sql = '''SELECT * 
FROM MACHINELEARNINGOUTPUTS.LTBP.LTBP_MODEL_TRACKING_FY23
WHERE EVER_PRODUCTION
'''
sf.run_sql_str(ever_production_sql)

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off


Unnamed: 0,ACTION_DESCRIPTION,TRANSACTION_TYPE,COMMITID,ENVIRONMENT,BRANCH,TIMESTAMP,ARTIFACTS,METRICS,EXPERIMENT_NAME,EXPERIMENT,PRODUCTION_MODEL,EVER_PRODUCTION


## Production Level Run 

Awesome, there is a model that is ready to predict with. Let's go ahead and show what is needed to run an inference-only pipeline.

In [None]:
#| skip
# Build the inference data set for the BASELINE experiment via the `data_creation` CLI.
# No schema or `--experiment` flag is passed here; the log output reports "This is a production run".
! data_creation  --train_or_inference "INFERENCE" --experiment_name "BASELINE" 

  warn_incompatible_dep(
INFO:root:This is a production run
INFO:root:Loading Yaml Files..
INFO:root:Generating Feature Set Query
INFO:root:static features in data set: 
 ['DESTINATIONGEOAFINITYLABEL', 'GenderCode']
INFO:root:temporal features in data set: 
 ['Age', 'AvgVisitPerSeason', 'BoughtPass', 'EverCorePass', 'EverPass', 'GuestBehavior', 'IsEpicMixActivated', 'MarketingZone', 'MostCommonTicketComp', 'MostSubSeasonVisited', 'MostVisitedRegion', 'MostVisitedResort', 'OnlySingleResortKey', 'PartnerResortScannerFlag', 'ResortsVisited', 'SkierabilityLabel', 'SubSeasonsPerYear', 'TotalSeasonsScanned', 'TotalVisits', 'VisitMostInPeak']
INFO:root:Appending static feature DESTINATIONGEOAFINITYLABEL to query
INFO:root:Appending static feature GenderCode to query
INFO:root:Finished appending static features
INFO:root:reading inference_base.sql for base query...
INFO:root:final query output: 
 select
base.*
, joined.DESTINATIONGEOAFINITYLABEL
, joined.GenderCode
, MACHINELEARNINGFEATURES.PR

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:root:data has been delivered from sf to adls


In [None]:
#| skip
# Build the inference data set for the NOHYPEROPT experiment via the `data_creation` CLI.
# No schema or `--experiment` flag is passed here; the log output reports "This is a production run".
! data_creation  --train_or_inference "INFERENCE" --experiment_name "NOHYPEROPT" 

  warn_incompatible_dep(
INFO:root:This is a production run
INFO:root:Loading Yaml Files..
INFO:root:Generating Feature Set Query
INFO:root:static features in data set: 
 ['DESTINATIONGEOAFINITYLABEL', 'GenderCode']
INFO:root:temporal features in data set: 
 ['Age', 'AvgVisitPerSeason', 'BoughtPass', 'EverCorePass', 'EverPass', 'GuestBehavior', 'IsEpicMixActivated', 'MarketingZone', 'MostCommonTicketComp', 'MostSubSeasonVisited', 'MostVisitedRegion', 'MostVisitedResort', 'OnlySingleResortKey', 'PartnerResortScannerFlag', 'ResortsVisited', 'SkierabilityLabel', 'SubSeasonsPerYear', 'TotalSeasonsScanned', 'TotalVisits', 'VisitMostInPeak']
INFO:root:Appending static feature DESTINATIONGEOAFINITYLABEL to query
INFO:root:Appending static feature GenderCode to query
INFO:root:Finished appending static features
INFO:root:reading inference_base.sql for base query...
INFO:root:final query output: 
 select
base.*
, joined.DESTINATIONGEOAFINITYLABEL
, joined.GenderCode
, MACHINELEARNINGFEATURES.PR

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:root:data has been delivered from sf to adls


In [None]:
#| skip
# Run `model_inference` against the 'LTBP' schema with no experiment name and no `--experiment` flag.
# NOTE(review): presumably this makes the CLI score with the runs flagged PRODUCTION_MODEL in the
# tracking log (the results below contain both BASELINE and NOHYPEROPT); confirm against the CLI.
! model_inference --sfSchema 'LTBP'

  warn_incompatible_dep(
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:data_system_utilities.snowflake.query:Drop statement executed successfully (LTBP_PREDICTIONS_FY23 already dropped).
INFO:data_system_utilities.snowflake.utils:stage_query: 
 create or replace stage ltbpFY23LocalRunTest
url='azure://vaildtscadls.blob.core.windows.net/vailadls/projects/LTBP/FY23/LocalRunTest/BASELINE/'
credentials=(azure_sas_token='**MASKED**')
encryption=(

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:root:Preview dataframe queried        ECID SEASONYEAR  AGE  ... TOTALSEASONSSCANNED TOTALVISITS VISITMOSTINPEAK
0   5224121    2021/22   64  ...                   0           0               0
1  65773479    2021/22   23  ...                   0           0               0
2  57541763    2021/22  119  ...                   0           0               0
3  49372383    2021/22   46  ...                   0           0               0
4  58111328    2021/22    5  ...                   0           0               0

[5 rows x 24 columns]
INFO:root:Loading Sklearn Object to: ./models/train_xgb_basicLocalRunTestNOHYPEROPT.pkl
INFO:data_system_utilities.azure.storage:Downloading projects/LTBP/FY

### Results 

In [None]:
#| skip
# Five most recently created rows of the LTBP (production) predictions table.
prod_predictions_sql = '''SELECT * 
FROM MACHINELEARNINGOUTPUTS.LTBP.LTBP_PREDICTIONS_FY23
ORDER BY DATE_CREATED DESC
LIMIT 5
'''
sf.run_sql_str(prod_predictions_sql)

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off


Unnamed: 0,ECID,SEASONYEAR,PROBABILITY,CI_COMMIT_SHA,DATE_CREATED,EXPERIMENT
0,49372383,2021/22,0.02007,LocalRunTest,2022-11-04 07:54:33,NOHYPEROPT
1,65773479,2021/22,0.028624,LocalRunTest,2022-11-04 07:54:33,NOHYPEROPT
2,5224121,2021/22,0.060644,LocalRunTest,2022-11-04 07:54:33,NOHYPEROPT
3,58111328,2021/22,0.028791,LocalRunTest,2022-11-04 07:54:33,NOHYPEROPT
4,57541763,2021/22,0.037379,LocalRunTest,2022-11-04 07:54:33,NOHYPEROPT


In [None]:
#| skip
# One row per (commit, creation time, experiment) in the production predictions table,
# newest first.
prod_prediction_runs_sql = '''SELECT DISTINCT CI_COMMIT_SHA, DATE_CREATED, EXPERIMENT
FROM MACHINELEARNINGOUTPUTS.LTBP.LTBP_PREDICTIONS_FY23
ORDER BY DATE_CREATED DESC
LIMIT 5
'''
sf.run_sql_str(prod_prediction_runs_sql)

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off


Unnamed: 0,CI_COMMIT_SHA,DATE_CREATED,EXPERIMENT
0,LocalRunTest,2022-11-04 07:54:33,NOHYPEROPT
1,LocalRunTest,2022-11-04 07:54:23,BASELINE


If this table populates, you can think about how a post-analysis experiment could work: for example, you can average the predictions of the production-level models as an ensemble and weight them depending on your confidence. There is also an ensemble trainer that can be used, but it isn't covered here.

In [None]:
#| skip
# ECIDs that have at least two rows in the production predictions table, i.e. candidates
# for combining predictions across the promoted models.
# Two models are promoted in this walkthrough, so the threshold is `>= 2`; the previous
# `> 2` could only match ECIDs with three or more rows and therefore came back empty.
sf.run_sql_str('''SELECT ECID, COUNT(1) Total
FROM MACHINELEARNINGOUTPUTS.LTBP.LTBP_PREDICTIONS_FY23
GROUP BY ECID
HAVING COUNT(1) >= 2
LIMIT 5
''')

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off


Unnamed: 0,ECID,TOTAL


# Want to Restart?

Here are all the commands to start over

TODO: put all the tear-down commands here

In [None]:
#| skip
# Tear-down statements, kept commented out; uncomment to drop the DEV predictions table,
# the LTBP model tracking log and the LTBP predictions table and start over.
# NOTE(review): MACHINELEARNINGOUTPUTS.DEV.LTBP_HOLDOUT_TEST_MODEL_RESULTS (queried earlier in
# this notebook) is not dropped here; confirm whether it belongs in the tear-down.
# sf.run_sql_str('DROP TABLE MACHINELEARNINGOUTPUTS.DEV.LTBP_PREDICTIONS_FY23')
# sf.run_sql_str('DROP TABLE MACHINELEARNINGOUTPUTS.LTBP.LTBP_MODEL_TRACKING_FY23')
# sf.run_sql_str('DROP TABLE MACHINELEARNINGOUTPUTS.LTBP.LTBP_PREDICTIONS_FY23')

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:data_system_utilities.snowflake.query:LTBP_PREDICTIONS_FY23 successfully dropped.
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:data_system_utilities.snowflake.query:LTBP_MODEL_TRACKING_FY23 successfully dropped.
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system

Unnamed: 0,status
0,LTBP_PREDICTIONS_FY23 successfully dropped.


In [None]:
#| hide
# Run nbdev's export step as the final cell of the notebook.
import nbdev

nbdev.nbdev_export()