In [1]:
#default_exp modeling.inference

In [2]:
#hide
from nbdev.showdoc import *
from sklearn import datasets
from sdsde.azure.filehandling import FileHandling
from sdsde.snowflake.query import SnowflakeConnect
from datetime import datetime as dt

import pandas as pd
import datetime

# Inference Functionality

These functions are designed to help with anything in the Inference stage of the ML life cycle.

In [3]:
#export
import os
import pickle
import pyarrow
import shutil
import pyarrow.parquet as pq
import logging
import numpy as np

from sdsde.wrapper.azurewrapper import blob_pusher, blob_puller
from sdsde.modeling.premodel import make_data_lake_stage

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Model Pulling

### `pull_sklearn_object_from_data_lake`

In [4]:
#export


def pull_sklearn_object_from_data_lake(file_name: str, path: str, container: str, connection_str: str):
    """pulls a pickeld sklearn object from azure data lake to memory

    Args:
    * file_name (str): name of file
    * path (str): data lake path
    * container (str): data lake container
    * connection_str (str): azure connection string for the account

    Returns:
    * (sklearn object): sklearn object loaded from azure
    """
    logger.info(f'Loading Sklearn Object: {os.path.join(path, file_name)}')
    blob_puller(files=[os.path.join(path, file_name)],
                connection_str=connection_str,
                container_name=container,
                drop_location='.',
                overwrite=True)
    with open(file_name, 'rb') as f:
        pipeline = pickle.load(f)
    os.unlink(file_name)
    logger.info('Sklearn Object Loaded')
    return pipeline

In [5]:
show_doc(pull_sklearn_object_from_data_lake)

<h4 id="pull_sklearn_object_from_data_lake" class="doc_header"><code>pull_sklearn_object_from_data_lake</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>pull_sklearn_object_from_data_lake</code>(**`file_name`**:`str`, **`path`**:`str`, **`container`**:`str`, **`connection_str`**:`str`)

pulls a pickeld sklearn object from azure data lake to memory

Args:
* file_name (str): name of file
* path (str): data lake path
* container (str): data lake container
* connection_str (str): azure connection string for the account

Returns:
* (sklearn object): sklearn object loaded from azure

In [6]:
model = pull_sklearn_object_from_data_lake(file_name='RandomForestExample.pickle',
                                           path='dsde_library/testing/models/', 
                                           container='dsdetesting',
                                           connection_str=os.environ['DATALAKE_CONN_STR_SECRET'],
                                          )
model

INFO:__main__:Loading Sklearn Object: dsde_library/testing/models/RandomForestExample.pickle
INFO:sdsde.azure.filehandling:dsde_library/testing/models/RandomForestExample.pickle to ./RandomForestExample.pickle
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
INFO:__main__:Sklearn Object Loaded


RandomForestRegressor(max_depth=18)

## Model Inference

### `push_dataframe_to_data_lake_as_parquet`

In [7]:
#export


def push_dataframe_to_data_lake_as_parquet(df, path, container, connection_str,
                                           partition_cols: list = ["partitionidx"], overwrite=True):
    """takes a pandas dataframe and writes it to azure via pyarrow with parquet files

    Args:
    * df (pd.DataFame): dataframe
    * path (str): data lake path
    * container (str): data lake container
    * connection_str (str): azure connection string
    * partition_cols (list, optional): how to partition. fake partitions for speed make on default. Defaults to ["partitionidx"].
    * overwrite (bool, optional): do you overwrite what is there now. Defaults to True.
    """

    if os.path.exists(path):
        shutil.rmtree(path)
        logger.info(f'Removing existing files to write a new batch from {path}')

    if partition_cols[0] == "partitionidx":
        n_partition = int(np.ceil(df.shape[0] / 50000))
        df["partitionidx"] = np.random.choice(range(n_partition), size=df.shape[0])
        logger.info(f'Partitioning column created for distribution with {n_partition} partitions')

    table = pyarrow.Table.from_pandas(df, preserve_index=False)
    pq.write_to_dataset(table, root_path=path, partition_cols=partition_cols)
    logger.info('Parquet file staged in local disk memory')

    all_files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(path) for f in filenames if os.path.splitext(f)[1] == '.parquet']
    for file_name in all_files:
        logger.info(f'Moving File: {file_name}')
        blob_pusher(container_name=container,
                    connection_str=connection_str,
                    file_path=[file_name],
                    blob_dest=[os.path.dirname(file_name)],
                    overwrite=overwrite)

    shutil.rmtree(path)
    logger.info('Local parquet files removed')

In [8]:
show_doc(push_dataframe_to_data_lake_as_parquet)

<h4 id="push_dataframe_to_data_lake_as_parquet" class="doc_header"><code>push_dataframe_to_data_lake_as_parquet</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>push_dataframe_to_data_lake_as_parquet</code>(**`df`**, **`path`**, **`container`**, **`connection_str`**, **`partition_cols`**:`list`=*`['partitionidx']`*, **`overwrite`**=*`True`*)

takes a pandas dataframe and writes it to azure via pyarrow with parquet files

Args:
* df (pd.DataFame): dataframe
* path (str): data lake path
* container (str): data lake container
* connection_str (str): azure connection string
* partition_cols (list, optional): how to partition. fake partitions for speed make on default. Defaults to ["partitionidx"].
* overwrite (bool, optional): do you overwrite what is there now. Defaults to True.

In [9]:
data = datasets.load_boston()
df = pd.DataFrame(data['data'])
df.columns = data['feature_names']

time = np.random.randint(0,100000000)

push_dataframe_to_data_lake_as_parquet(df=df,
                                       path=f'dsde_library/testing/parquet/pyarrowpush/{time}', 
                                       container='dsdetesting', 
                                       connection_str=os.environ['DATALAKE_CONN_STR_SECRET'],
                                       partition_cols=['partitionidx']
                                      )


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this case special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows:

        from sklearn.datasets import fetch_californi

### `move_parquet_table_to_snowflake`

In [10]:
#export


def move_parquet_table_to_snowflake(sf_connection, table_name: str, stage_name: str,
                                    path: dict, columns_and_types: dict,
                                    pattern: str, replace_table: bool = True):
    """moves data sitting in a parquet format in ADLS to a snowflake table

    Args:
    * sf_connection (SnowflakeConnect): snowflake connection
    * table_name (str): table name
    * stage_name (str): snowflake stage name
    * path (str): path in ADLS to parquet data
    * columns_and_types (dict): snowflake column namees and types
    * pattern (str): pattern for reading files from ADLS
    * replace_table (bool, optional): true does create or relace, false does insert. Defaults to True.
    """

    if replace_table is False:
        select_query = f'''
        insert into {table_name}
            select
                FEATURES_HERE
            from @{stage_name + path} (pattern=>'{pattern}')
        '''
    else:
        select_query = f'''
        create or replace table {table_name} as
            select
                FEATURES_HERE
            from @{stage_name + path} (pattern=>'{pattern}')
        '''
    for k, v in columns_and_types.items():
        select_query = select_query.replace('FEATURES_HERE', f'$1:"{k}"::{v.upper()} as {k}, FEATURES_HERE')
    select_query = select_query.replace(', FEATURES_HERE', '')
    logger.info(select_query)
    sf_connection.run_str_query(select_query)

In [11]:
show_doc(move_parquet_table_to_snowflake)

<h4 id="move_parquet_table_to_snowflake" class="doc_header"><code>move_parquet_table_to_snowflake</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>move_parquet_table_to_snowflake</code>(**`sf_connection`**, **`table_name`**:`str`, **`stage_name`**:`str`, **`path`**:`dict`, **`columns_and_types`**:`dict`, **`pattern`**:`str`, **`replace_table`**:`bool`=*`True`*)

moves data sitting in a parquet format in ADLS to a snowflake table

Args:
* sf_connection (SnowflakeConnect): snowflake connection
* table_name (str): table name
* stage_name (str): snowflake stage name
* path (str): path in ADLS to parquet data
* columns_and_types (dict): snowflake column namees and types
* pattern (str): pattern for reading files from ADLS
* replace_table (bool, optional): true does create or relace, false does insert. Defaults to True.

In [15]:
sf = SnowflakeConnect(sfAccount=os.environ['sfAccount'],
                   sfUser=os.environ['sfUser'],
                   sfPswd=os.environ['sfPswd'],
                   sfWarehouse=os.environ['sfWarehouse'],
                   sfDatabase=os.environ['sfDatabase'],
                   sfSchema=os.environ['sfSchema'],
                   sfRole=os.environ['sfRole'])

make_data_lake_stage(sf_connection=sf, 
                     stage_name='sdsdetest',
                     account=os.environ['azure_account'], 
                     container='sdsdetesting', 
                     data_lake_path='sdsde_library/testing', 
                     sas_token=os.environ['DATALAKE_SAS_TOKEN_SECRET'])

INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:connection to snowflake successful
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loaded from snowflake
INFO:sdsde.snowflake.query:connection to snowflake has been turned off
INFO:sdsde.snowflake.query:Stage area SDSDETEST successfully created.


In [16]:
cols = {'ZN': 'number', 'INDUS': 'varchar'}
move_parquet_table_to_snowflake(sf_connection=sf, 
                                table_name='sdsdelibparquettest', 
                                stage_name='sdsdetest',
                                path=f'/parquet/pyarrowpush/{time}/',
                                columns_and_types=cols, 
                                pattern='.*.parquet',
                                replace_table = True)
move_parquet_table_to_snowflake(sf_connection=sf,
                                table_name='sdsdelibparquettest', 
                                stage_name='sdsdetest',
                                path=f'/parquet/pyarrowpush/{time}/',
                                columns_and_types=cols, 
                                pattern='.*.parquet',
                                replace_table = False)

INFO:__main__:
        create or replace table sdsdelibparquettest as
            select
                $1:"ZN"::NUMBER as ZN, $1:"INDUS"::VARCHAR as INDUS
            from @sdsdetest/parquet/pyarrowpush/5481230/ (pattern=>'.*.parquet')
        
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loaded from snowflake
INFO:sdsde.snowflake.query:connection to snowflake has been turned off
INFO:sdsde.snowflake.query:Table SDSDELIBPARQUETTEST successfully created.
INFO:__main__:
        insert into sdsdelibparquettest
            select
                $1:"ZN"::NUMBER as ZN, $1:"INDUS"::VARCHAR as INDUS
            from @sdsdetest/parquet/pyarrowpush/5481230/ (pattern=>'.*.parquet')
        
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowfl

In [17]:
sf.run_str_query('DROP TABLE sdsdelibparquettest')

INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loaded from snowflake
INFO:sdsde.snowflake.query:connection to snowflake has been turned off
INFO:sdsde.snowflake.query:SDSDELIBPARQUETTEST successfully dropped.


In [18]:
sf.run_str_query('DROP STAGE sdsdetest')

INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loaded from snowflake
INFO:sdsde.snowflake.query:connection to snowflake has been turned off
INFO:sdsde.snowflake.query:SDSDETEST successfully dropped.


### `query_and_push_feature_set_to_data_lake`

When a feature set isn't in the feature store there will be times where this happens and we will want to be able to use very similar mechanics to what all other projects that are in the feature store have. This is where this method will come in handy.

In [19]:
#export


def query_and_push_feature_set_to_data_lake(sf_connection: object, query_file_path: str,
                                            stage_name: str, account: str,
                                            container: str, data_lake_path: str,
                                            blob_path: str, sas_token: str,
                                            connection_str: str, overwrite=True):
    """
    Take a in RAM data set and parition out data set into parquet files
    that are then sent to Azure Data Lake. This assumes that the feature
    store isn't being used for this project. The use case for this is
    to save training/test and predictions to azure that will then be
    sent to snowflake.

    Args:
    * sf_connection (SnowFlake Engine): Snowflake connection
    * query_file_path (str): Path to file to execute
    * stage_name (str): Stage Name for snowflake
    * account (str): Azure blob account name
    * container (str): Container in blob account
    * data_lake_path (str): root level for stage name allowing for re-use
    * blob_path (str): path in container to store data
    * sas_token (str): SAS token found in Access Keys
    * connection_str (str): connection str to azure blob found in Access Keys in Azure
    * overwrite (bool, optional): Overwrite files. Defaults to True.
    """
    logger.info('creating datalake staging area')
    make_data_lake_stage(sf_connection=sf_connection,
                         stage_name=stage_name,
                         account=account,
                         container=container,
                         data_lake_path=data_lake_path,
                         sas_token=sas_token)
    logger.info('begin query....')
    df = sf_connection.execute_file(query_file_path)
    df.columns = [x.lower() for x in df.columns]
    logger.info(f'query complete files being written to {data_lake_path}')
    push_dataframe_to_data_lake_as_parquet(df=df,
                                           container=container,
                                           path=blob_path,
                                           connection_str=connection_str,
                                           overwrite=overwrite)

In [20]:
show_doc(query_and_push_feature_set_to_data_lake)

<h4 id="query_and_push_feature_set_to_data_lake" class="doc_header"><code>query_and_push_feature_set_to_data_lake</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>query_and_push_feature_set_to_data_lake</code>(**`sf_connection`**:`object`, **`query_file_path`**:`str`, **`stage_name`**:`str`, **`account`**:`str`, **`container`**:`str`, **`data_lake_path`**:`str`, **`blob_path`**:`str`, **`sas_token`**:`str`, **`connection_str`**:`str`, **`overwrite`**=*`True`*)

Take a in RAM data set and parition out data set into parquet files
that are then sent to Azure Data Lake. This assumes that the feature
store isn't being used for this project. The use case for this is
to save training/test and predictions to azure that will then be
sent to snowflake.

Args:
* sf_connection (SnowFlake Engine): Snowflake connection
* query_file_path (str): Path to file to execute
* stage_name (str): Stage Name for snowflake
* account (str): Azure blob account name
* container (str): Container in blob account
* data_lake_path (str): root level for stage name allowing for re-use
* blob_path (str): path in container to store data
* sas_token (str): SAS token found in Access Keys
* connection_str (str): connection str to azure blob found in Access Keys in Azure
* overwrite (bool, optional): Overwrite files. Defaults to True.

# Create

In [22]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 01_azure.ipynb.
Converted 02_utils_dataframes.ipynb.
Converted 02_utils_parseyaml.ipynb.
Converted 02_utils_stfp.ipynb.
Converted 02_utils_traininghelpers.ipynb.
Converted 02_utils_traininghelpers_fastai.ipynb.
Converted 03_dstools_preparedata.ipynb.
Converted 04_snowflake_copyinto.ipynb.
Converted 04_snowflake_copyinto2.ipynb.
Converted 04_snowflake_query.ipynb.
Converted 05_azure_wrappers.ipynb.
Converted 06_modeling_inference.ipynb.
Converted 06_modeling_inference_fastai.ipynb.
Converted 06_modeling_premodel.ipynb.
Converted 06_modeling_preprocessing.ipynb.
Converted 06_modeling_preprocessing_fastai.ipynb.
Converted 06_modeling_training.ipynb.
Converted 06_modeling_training_fastai.ipynb.
Converted 07_Binary_Classification_Fastai_Example_Notebook.ipynb.
Converted index.ipynb.
