# Inference Utils

> Inference utilities used in this project

In [None]:
#| default_exp inference.utils

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *
from LTBP.data.utils import get_yaml_dicts
from LTBP.modeling.utils import create_stage_and_query_stage_sf

In [None]:
#| export

from LTBP.data.utils import snowflake_query

from data_system_utilities.snowflake.copyinto import adls_url_to_sf_query_generator
from data_system_utilities.snowflake.utils import create_table_query_from_df
from data_system_utilities.azure.storage import FileHandling

import pickle
import os
import logging
import datetime
import shutil
import pytz


  warn_incompatible_dep(


In [None]:
#| export


def pull_sklearn_object_from_adls(adls_path: str,
                                  file_name: str,
                                  container_name: str,
                                  connection_str: str,
                                  drop_local_path: str = '.',
                                  clean_up: bool = True):
    """Download a pickled sklearn object from azure data lake and load it into memory

    Args:
        adls_path (str): data lake directory holding the file; it is concatenated
            directly with ``file_name``, so it must already end with ``/``
        file_name (str): name of the pickled file
        container_name (str): data lake container
        connection_str (str): azure connection string for the account
        drop_local_path (str): local directory the file is downloaded into;
            created when it does not exist yet
        clean_up (bool): when True the local copy is removed after loading.
            A directory created by this call is removed entirely; a directory
            that already existed only loses the downloaded file.

    Returns:
        (sklearn object): sklearn object loaded from azure
    """
    local_file = os.path.join(drop_local_path, file_name)
    logging.info(f'Loading Sklearn Object to: {local_file}')

    # Remember whether the directory is ours: only then is it safe to delete it
    # wholesale during clean up (the default '.' is the working directory).
    created_dir = not os.path.exists(drop_local_path)
    if created_dir:
        os.makedirs(drop_local_path)

    try:
        fh = FileHandling(connection_str)
        fh.download_file(
            azure_file_path=adls_path+file_name,
            container_name=container_name,
            local_file_path=drop_local_path,
            overwrite=True
        )

        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file, so only pull objects from storage accounts this project controls.
        with open(local_file, 'rb') as f:
            pipeline = pickle.load(f)
        logging.info('Sklearn Object Loaded')
    finally:
        # Runs on failure too so a broken download does not leave files behind.
        if clean_up:
            if created_dir:
                shutil.rmtree(drop_local_path, ignore_errors=True)
            elif os.path.exists(local_file):
                os.remove(local_file)

    return pipeline

This is the DSDE standard process for using XGBoost with hyperopt.

In [None]:
#| export


def prediction_to_adls_and_sf(
    df,  # pandas dataframe to infer on
    sk_model_pipe,  # Sklearn Pipeline that brings preprocessing and modeling to data
    adls_path: str,  # adls root path
    models_dict: dict,  # model dict used through out the project classes would avoid this
    etl_dict: dict,  # etl dict used through the project
    experiment_name: str,  # name of experiment being ran
    sfSchema=os.getenv("sfSchema", "DEV"),  # defaults to enviornment variable or default
):
    """DEPRECATED, may break: score ``df`` and push the predictions to ADLS and Snowflake.

    Custom to this project. The steps are:

    1. Copy the ``models_dict['identification']`` columns of ``df`` and add the
       positive-class probability from ``sk_model_pipe.predict_proba`` plus the
       ``CI_COMMIT_SHA``, ``DATE_CREATED`` and ``EXPERIMENT`` bookkeeping columns.
    2. Write the result to a local csv, upload it to the experiment's
       predictions folder in ADLS and delete the local csv.
    3. Create ``models_dict['inference_sf_table_name']`` in Snowflake when it is
       missing and copy the uploaded csv into it.
    4. Query a few of the freshly inserted rows back and log them.

    Note that the ``sfSchema`` default is read from the environment once, when
    the function is defined, not on every call.

    Returns:
        None
    """
    commit_sha = os.environ.get('CI_COMMIT_SHA', 'LocalRunNBS')

    sf_df = df[models_dict['identification']].copy()
    # Change Here Name change for a regression and to predict or multi-labled needs some work
    sf_df['PROBABILITY'] = sk_model_pipe.predict_proba(df)[:, 1]
    del df
    date_created = datetime.datetime.now(pytz.timezone("US/Mountain")).strftime('%Y-%m-%d %H:%M:%S')
    sf_df['CI_COMMIT_SHA'] = commit_sha
    sf_df['DATE_CREATED'] = date_created
    sf_df['EXPERIMENT'] = experiment_name
    file_name = f"predictions_{commit_sha+experiment_name}.csv"

    # Built once so the upload target and the Snowflake copy source always point
    # at the same experiment folder.
    predictions_dir = os.path.join(adls_path,
                                   models_dict['predictions_adls_path'],
                                   models_dict[experiment_name]['model_trainer'])

    try:
        # Saving as a .csv for simple reading from adls download using dask would be best here
        sf_df.to_csv(file_name, index=False)
        logging.info(f'preview predictions being added:\n{sf_df.head(3)}')
        logging.info(f'preview predictions values addes:\n{sf_df.iloc[0].values}')
        logging.info(f'preview predictions being added columns:\n{sf_df.columns}')
        az = FileHandling(os.environ[models_dict['connection_str']])
        az.upload_file(
            azure_file_path=predictions_dir,
            local_file_path=file_name,
            container_name=etl_dict['azure_container'],
            overwrite=True,
        )
    finally:
        # Never leave the temporary csv behind, even when the upload fails.
        if os.path.exists(file_name):
            os.unlink(file_name)

    stage_url = f"azure://{etl_dict['azure_account']}.blob.core.windows.net/{etl_dict['azure_container']}/"
    preds_file_path = os.path.join(predictions_dir, file_name)

    sf = snowflake_query(sfSchema=sfSchema)
    # Create the destination table from the dataframe layout on first use.
    if models_dict['inference_sf_table_name'].upper() not in sf.run_sql_str("show tables;").name.tolist():
        sf.run_sql_str(create_table_query_from_df(sf_df, table_name_sf=models_dict['inference_sf_table_name'], varchar=False))

    logging.info("Pushing Forecasted Season from ADLS to Snowflake")
    adls_query = adls_url_to_sf_query_generator(
        azure_path=os.path.join(stage_url, preds_file_path),
        azure_sas_token=os.environ[models_dict['sas_token']],
        table_name=models_dict['inference_sf_table_name'],
        database=sf.connection_inputs['database'],
        schema=sf.connection_inputs['schema'],
        skip_header='1',
        file_type='csv',
        pattern='.*.csv')
    sf.run_sql_str(adls_query)

    # Read back rows from this exact run to confirm the copy landed.
    exp_table = sf.run_sql_str(f"""
    SELECT *
    FROM {models_dict['inference_sf_table_name']}
    WHERE DATE_CREATED = '{date_created}'
    AND EXPERIMENT = '{experiment_name}'
    LIMIT 3
    """)
    logging.info(f'preview of queried table being added:\n{exp_table.head(3)}')
    logging.info(f'preview predictions values addes:\n{exp_table.iloc[0].values}')

In [None]:
#| skip
# Demo run: pull the BASELINE model, score a small inference sample and push
# the predictions through the ADLS -> Snowflake upload.
experiment_name = 'BASELINE'
experiment = True
yaml_file_list = ['features.yaml', 'udf_inputs.yaml', 'etl.yaml', 'models.yaml']

features, udf_inputs, etl_dict, models_dict = get_yaml_dicts(yaml_file_list)

# Experiment runs live under an "experiments/<name>" folder; other runs are
# keyed by the commit sha instead.
if experiment:
    adls_path = os.path.join(etl_dict['data_lake_path'], 'experiments', experiment_name)
else:
    adls_path = os.path.join(etl_dict['data_lake_path'],
                             os.environ.get('CI_COMMIT_SHA', 'LocalRunNBS'))

# File name layout: <model_trainer><commit sha><experiment name>.pkl
model_name = ''.join([
    models_dict[experiment_name]['model_trainer'],
    os.environ.get('CI_COMMIT_SHA', 'LocalRunNBS'),
    experiment_name,
    '.pkl',
])

# The trailing '/' is required because the helper concatenates path + file name.
model = pull_sklearn_object_from_adls(
    adls_path=os.path.join(
        adls_path,
        models_dict['modeling_adls_path'],
        models_dict[experiment_name]['model_trainer'],
    ) + '/',
    file_name=model_name,
    drop_local_path='./models/',
    container_name=etl_dict['azure_container'],
    connection_str=os.environ[models_dict['connection_str']],
)

# Query a capped sample of the inference data set.
sf = snowflake_query()
df_infer = create_stage_and_query_stage_sf(
    sf=sf,
    etl=etl_dict,
    udf_inputs=udf_inputs,
    train_or_inference='INFERENCE',
    experiment_name=experiment_name,
    experiment=experiment,
    indentification=models_dict['identification'],
    extra_statement='LIMIT 1000',
)
logging.info(f'size of test set {df_infer.shape}')
logging.info(f'Preview inference data:\n{df_infer.head(2)}')
logging.info(f'Preview inference data values:\n{df_infer.iloc[0].values}')

logging.info('Begining on inference upload process')
prediction_to_adls_and_sf(
    df=df_infer,
    sk_model_pipe=model,
    adls_path=adls_path,
    models_dict=models_dict,
    etl_dict=etl_dict,
    experiment_name=experiment_name,
    sfSchema=os.getenv("sfSchema", "DEV"),
)
logging.info(f'Inference stage complete for {experiment_name}')

INFO:root:Loading Sklearn Object to: ./models/train_xgbLocalRunTestBASELINE.pkl
INFO:data_system_utilities.azure.storage:Downloading projects/LTBP/FY23/experiments/BASELINE/modeling/train_xgb/train_xgbLocalRunTestBASELINE.pkl to ./models/train_xgbLocalRunTestBASELINE.pkl
INFO:data_system_utilities.azure.storage:Download complete
INFO:root:Sklearn Object Loaded
INFO:data_system_utilities.snowflake.utils:stage_query: 
 create or replace stage ltbpFY23LocalRunTest
url='azure://vaildtscadls.blob.core.windows.net/vailadls/projects/LTBP/FY23/experiments/BASELINE'
credentials=(azure_sas_token='**MASKED**')
encryption=(type= 'NONE')
file_format = (type = parquet        )
INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.quidenfication to snowflake has been turned off
INFO:data_system_utilities.snowf

INFO:data_system_utilities.snowflake.utils:connection to snowflake established...
INFO:data_system_utilities.snowflake.query:executing query
INFO:data_system_utilities.snowflake.query:data loaded from snowflake
INFO:data_system_utilities.snowflake.query:connection to snowflake has been turned off
INFO:root:preview of queried table being added:
       ECID SEASONYEAR  PROBABILITY CI_COMMIT_SHA         DATE_CREATED  \
0   4274289    2021/22     0.191546  LocalRunTest  2022-11-03 15:17:48   
1  83442101    2021/22     0.180928  LocalRunTest  2022-11-03 15:17:48   
2   7327662    2021/22     0.193184  LocalRunTest  2022-11-03 15:17:48   

  EXPERIMENT  
0   BASELINE  
1   BASELINE  
2   BASELINE  
INFO:root:preview predictions values addes:
['4274289' '2021/22' 0.19154589 'LocalRunTest' '2022-11-03 15:17:48'
 'BASELINE']
INFO:root:Inference stage complete for BASELINE


In [None]:
#| skip
# sf.run_sql_str(f"DROP TABLE {models_dict['tracking_table']}")
# sf.run_sql_str(f"DROP TABLE MACHINELEARNINGOUTPUTS.dev.{models_dict['hold_out_table']}")
# sf.run_sql_str(f"DROP TABLE MACHINELEARNINGOUTPUTS.dev.{table_name}")

In [None]:
#| hide
# Run nbdev's export step for this notebook's project.
import nbdev

nbdev.nbdev_export()