# Inference Script

> Inference utilities used in this project

In [None]:
#| default_exp scripts.inference

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#| export
from fastcore.script import Param, call_parse

from LTBP.modeling.utils import create_stage_and_query_stage_sf
from LTBP.data.utils import snowflake_query, get_yaml_dicts
from LTBP.inference.utils import pull_sklearn_object_from_adls, prediction_to_adls_and_sf

import os
import logging

# Script

This is the DSDE standard process for using XGBoost with hyperopt; this script covers the inference stage, scoring new data with the trained model.

In [None]:
#| export


@call_parse
def model_inference(
    yaml_file_list: Param(help="YAML files to read", type=list,  # noqa:
                      default=['features.yaml', 'udf_inputs.yaml', 'etl.yaml', 'models.yaml']),  # noqa:
    experiment_name: Param(help="tell function what experiment is being ran", type=str, default='BASELINE'),  # noqa:
    experiment: Param(help="add experiment state it is not an experiment", type=bool, default=True),  # noqa:
    sfSchema: Param(help="dev queries dev schema anything else will query project schema", type=str, default='dev')  # noqa:
    ):  # noqa:
    """Score the inference data set with a stored model and pass the result to `prediction_to_adls_and_sf`."""
    # NOTE(review): argparse applies `type` to each raw CLI string, so `type=list`
    # would presumably split a file name into characters when passed on the
    # command line -- confirm, and consider `type=str, nargs='+'`.
    # NOTE(review): fastcore's `Param` appears to treat `type=bool` as a
    # `store_true` flag, which may override `default=True` -- confirm against the
    # installed fastcore version.

    # Exactly four dicts are expected, in this order; the first one
    # (from 'features.yaml' by default) is not used during inference.
    _, udf_inputs, etl_dict, models_dict = get_yaml_dicts(yaml_file_list)

    # Falls back to 'LocalRunNBS' when CI_COMMIT_SHA is not set in the environment.
    commit_sha = os.environ.get('CI_COMMIT_SHA', 'LocalRunNBS')

    # Experiments live under <data_lake_path>/experiments/<experiment_name>;
    # otherwise the commit identifier is used as the sub-directory.
    if experiment:
        adls_path = os.path.join(etl_dict['data_lake_path'], 'experiments', experiment_name)
    else:
        adls_path = os.path.join(etl_dict['data_lake_path'], commit_sha)

    trainer_name = models_dict[experiment_name]['model_trainer']
    model_name = trainer_name + commit_sha + experiment_name + '.pkl'

    model = pull_sklearn_object_from_adls(
        # Trailing '/' is appended explicitly to the joined directory path.
        adls_path=os.path.join(adls_path, models_dict['modeling_adls_path'], trainer_name) + '/',
        file_name=model_name,
        drop_local_path='./models/',
        container_name=etl_dict['azure_container'],
        # models_dict holds the *name* of the environment variable, not the secret itself.
        connection_str=os.environ[models_dict['connection_str']]
    )

    sf = snowflake_query()
    df_infer = create_stage_and_query_stage_sf(
        sf=sf,
        etl=etl_dict,
        udf_inputs=udf_inputs,
        train_or_inference='INFERENCE',
        experiment_name=experiment_name,
        experiment=experiment,
        indentification=models_dict['identification'],  # keyword spelling matches the callee as used in this file
        # NOTE(review): this limit is always passed, not only while experimenting;
        # presumably it caps the query at 1000 rows -- confirm this is intended.
        extra_statement='LIMIT 1000'  # Can add limit when experimenting 'LIMIT 1000'
    )
    logging.info(f'size of test set {df_infer.shape}')
    logging.info(f'Preview inference data:\n{df_infer.head(2)}')
    if df_infer.empty:
        # `.iloc[0]` raises IndexError on an empty frame; a preview log line
        # must not be what aborts the run.
        logging.warning('Inference query returned no rows; skipping row preview')
    else:
        logging.info(f'Preview inference data values:\n{df_infer.iloc[0].values}')

    logging.info('Begining on inference upload process')
    prediction_to_adls_and_sf(
        df=df_infer,
        sk_model_pipe=model,
        adls_path=adls_path,
        models_dict=models_dict,
        etl_dict=etl_dict,
        experiment_name=experiment_name,
        sfSchema=sfSchema
    )
    logging.info(f'Inference stage complete for {experiment_name}')

# Local Development

In [None]:
#| skip
# Manual walk-through of the same steps as `model_inference`, for local runs.
experiment_name = 'BASELINE'
experiment = True
yaml_file_list = ['features.yaml', 'udf_inputs.yaml', 'etl.yaml', 'models.yaml']
features, udf_inputs, etl_dict, models_dict = get_yaml_dicts(yaml_file_list)

# Falls back to 'LocalRunNBS' when CI_COMMIT_SHA is not set in the environment.
commit_sha = os.environ.get('CI_COMMIT_SHA', 'LocalRunNBS')

# Experiments live under <data_lake_path>/experiments/<experiment_name>;
# otherwise the commit identifier is used as the sub-directory.
if experiment:
    adls_path = os.path.join(etl_dict['data_lake_path'], 'experiments', experiment_name)
else:
    adls_path = os.path.join(etl_dict['data_lake_path'], commit_sha)

trainer_name = models_dict[experiment_name]['model_trainer']
model_name = trainer_name + commit_sha + experiment_name + '.pkl'

model = pull_sklearn_object_from_adls(
    # Trailing '/' is appended explicitly to the joined directory path.
    adls_path=os.path.join(adls_path, models_dict['modeling_adls_path'], trainer_name) + '/',
    file_name=model_name,
    drop_local_path='./models/',
    container_name=etl_dict['azure_container'],
    # models_dict holds the *name* of the environment variable, not the secret itself.
    connection_str=os.environ[models_dict['connection_str']]
)

sf = snowflake_query()
df_infer = create_stage_and_query_stage_sf(
    sf=sf,
    etl=etl_dict,
    udf_inputs=udf_inputs,
    train_or_inference='INFERENCE',
    experiment_name=experiment_name,
    experiment=experiment,
    indentification=models_dict['identification'],  # keyword spelling matches the callee as used in this file
    extra_statement='LIMIT 1000'  # presumably keeps the local query small -- confirm
)
logging.info(f'size of test set {df_infer.shape}')
logging.info(f'Preview inference data:\n{df_infer.head(2)}')
if df_infer.empty:
    # `.iloc[0]` raises IndexError on an empty frame; skip the row preview instead.
    logging.warning('Inference query returned no rows; skipping row preview')
else:
    logging.info(f'Preview inference data values:\n{df_infer.iloc[0].values}')

logging.info('Begining on inference upload process')
prediction_to_adls_and_sf(
    df=df_infer,
    sk_model_pipe=model,
    adls_path=adls_path,
    models_dict=models_dict,
    etl_dict=etl_dict,
    experiment_name=experiment_name,
    # Schema comes from the `sfSchema` environment variable, defaulting to "DEV".
    sfSchema=os.getenv("sfSchema", "DEV")
)
logging.info(f'Inference stage complete for {experiment_name}')

INFO:root:Loading Sklearn Object to: ./models/train_xgbLocalRunTestBASELINE.pkl
INFO:data_system_utilities.azure.storage:Downloading projects/LTBP/FY23/experiments/BASELINE/modeling/train_xgb/train_xgbLocalRunTestBASELINE.pkl to ./models/train_xgbLocalRunTestBASELINE.pkl
INFO:data_system_utilities.azure.storage:Download complete
INFO:root:Sklearn Object Loaded
INFO:data_system_utilities.snowflake.utils:stage_query: 
 create or replace stage ltbpFY23LocalRunTest
url='azure://vaildtscadls.blob.core.windows.net/vailadls/projects/LTBP/FY23/experiments/BASELINE'
credentials=(azure_sas_token='**MASKED**')
encryption=(type= 'NONE')
file_format = (type = parquet        )
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:data_system_utilities.sno

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:root:preview of queried table being added:
       ECID SEASONYEAR  PROBABILITY CI_COMMIT_SHA         DATE_CREATED  \
0  75277491    2021/22     0.100722  LocalRunTest  2022-11-02 18:15:18   
1  57932090    2021/22     0.135460  LocalRunTest  2022-11-02 18:15:18   
2  40834443    2021/22     0.117061  LocalRunTest  2022-11-02 18:15:18   

  EXPERIMENT  
0   BASELINE  
1   BASELINE  
2   BASELINE  
INFO:root:preview predictions values addes:
['75277491' '2021/22' 0.10072208 'LocalRunTest' '2022-11-02 18:15:18'
 'BASELINE']
INFO:root:Inference stage complete for BASELINE


In [None]:
#| hide
# Export the `#| export` cells of this notebook to the package module.
import nbdev

nbdev.nbdev_export()