In [1]:
#default_exp modeling.preprocessing

In [2]:
#hide
from nbdev.showdoc import *
from sdsde import files
from sdsde.snowflake.query import SnowflakeConnect
from sdsde.wrapper.azurewrapper import blob_puller
from IPython.display import display, HTML

import pandas as pd
import json

# Preprocessing Functionality

These functions are designed to help with anything in the preprocessing stage of the ML life cycle.

In [3]:
#export
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer

import sklearn.preprocessing as sklearnpre
import logging

# Configure root logging at INFO level and create the logger used by the
# functions defined in this notebook.
# NOTE(review): this cell is marked `#export`, so `basicConfig` will also run
# when the generated module is imported -- confirm that is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Scaling/Transforming

### `generate_sklearn_preprocessing_pipeline`

In [4]:
#export


def get_cont_cols(df, cols):
    """
    Column selector for continuous features when building pipelines.

    Args:
    * df: object indexed with ``df[cols]`` (a pandas DataFrame in this notebook)
    * cols: column label(s) to keep

    Returns:
    * the selection ``df[cols]``, with values left untouched
    """
    selected = df[cols]
    return selected


def get_cat_cols(df, cols):
    """
    Column selector for categorical features when building pipelines.

    The selected columns are cast to ``str`` so the downstream encoder sees
    string values.

    Args:
    * df: object indexed with ``df[cols]`` (a pandas DataFrame in this notebook)
    * cols: column label(s) to keep

    Returns:
    * the selection ``df[cols]`` cast with ``.astype(str)``
    """
    selected = df[cols]
    as_text = selected.astype(str)
    return as_text

In [5]:
#export


def generate_sklearn_preprocessing_pipeline(feature_dict, impute=True, impute_strategy='mean'):
    """Given a correctly formatted feature dictionary, this function will create
    (without fitting) a sklearn pipeline for preprocessing.

    The function accepts a dictionary keyed by feature (column) name. Each value
    names a transformer from the `sklearn.preprocessing` module (hence the name
    of the function), the keyword arguments to build it with, and the variable
    type of the feature. Features whose `variable_type` is `cont` are passed to
    the transformer as-is; any other `variable_type` (e.g. `cat`) is cast to
    `str` first. `args` may be omitted or left empty when the transformer takes
    no arguments.

    In yaml format, here would be an acceptable feature dictionary definition.

    ```
    MARKETINGZONE:
        variable_type: cat
        transformation:
            name: OrdinalEncoder
            args: {}
    ONLYSINGLERESORTKEY:
        variable_type: cat
        transformation:
            name: OneHotEncoder
            args:
                handle_unknown: ignore
    TOTALSEASONSSCANNED:
        variable_type: cont
        transformation:
            name: StandardScaler
            args: {}
    MAXSEASONVISITATIONSTREAK:
        variable_type: cont
        transformation:
            name: RobustScaler
            args: {}
    ```

    Args:
    * feature_dict (dict): definition for feature transformations
    * impute (bool): impute values at the end or not
    * impute_strategy (str): how to impute values. default is mean

    Returns:
    * object: sklearn feature union, wrapped in a `Pipeline` that ends with a
      `SimpleImputer` when `impute` is True

    Raises:
    * AttributeError: a transformation `name` is not found in `sklearn.preprocessing`
    * KeyError: a feature entry lacks `transformation`, `name` or `variable_type`
    """
    logger.info('Creating Sklearn Preprocessing Pipeline')
    pipeline = []
    for feature in feature_dict:
        spec = feature_dict[feature]
        transformation = spec['transformation']
        name = transformation['name']
        # `args` may be absent or null (an empty `args:` key in YAML loads as
        # None); both mean "construct the transformer with its defaults".
        args = transformation.get('args') or {}
        try:
            transformer_cls = getattr(sklearnpre, name)
        except AttributeError as err:
            # Same exception type as before, but say which feature is misconfigured.
            raise AttributeError(
                f"Feature '{feature}': '{name}' is not available in sklearn.preprocessing"
            ) from err
        transformer = transformer_cls(**args)
        logger.info(f'Feature: {feature} --> Transformer: {transformer}')
        # Only 'cont' features keep their dtype; everything else is treated as categorical.
        selector = get_cont_cols if spec['variable_type'] == 'cont' else get_cat_cols
        # One branch per feature: pick the single column, then apply its transformer.
        pipeline.append(make_pipeline(FunctionTransformer(selector, kw_args={'cols': [feature]}, validate=False), transformer))
    # Concatenate the per-feature branches column-wise, in feature_dict order.
    preprocess_pipeline = make_union(*pipeline)
    if impute:
        logger.info(f'Imputing missing data with {impute_strategy} strategy')
        # Imputation runs on the transformed matrix, after every branch.
        preprocess_pipeline = Pipeline([
            ('preprocessing', preprocess_pipeline),
            ('imputing', SimpleImputer(strategy=impute_strategy))
        ])
    else:
        logger.info('No imputing for this pipeline')
    logger.info(f'Preprocessing Pipeline Object:\n{preprocess_pipeline}')
    return preprocess_pipeline

In [6]:
# Render the signature and docstring of the function as documentation output.
show_doc(generate_sklearn_preprocessing_pipeline)

<h4 id="generate_sklearn_preprocessing_pipeline" class="doc_header"><code>generate_sklearn_preprocessing_pipeline</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>generate_sklearn_preprocessing_pipeline</code>(**`feature_dict`**, **`impute`**=*`True`*, **`impute_strategy`**=*`'mean'`*)

Given a correctly formated feature dictionary, this function will create
(without fitting) a sklearn pipeline for preprocessing. The
function accepts a list of feature keys and transformer values. The
specified transformers should be from the `sklearn.preprocessing` module,
hence the name of the function. Arguments to the tranformer can also be passed
in with the dictionary. Imputing boolean and strategy are also accepted.
In yaml format, here would be an acceptable feature dictionary definition.

```
MARKETINGZONE:
    variable_type:cont
    transformation:
        name: OrdinalEncoder
        args: {}
ONLYSINGLERESORTKEY:
    variable_type:cont
    transformation:
        name:OneHotEncoder
        args:
            handle_unknown: ignore
TOTALSEASONSSCANNED:
    variable_type:cont
    transformation:
        name:StandardScaler
        args: {}
MAXSEASONVISITATIONSTREAK:
    variable_type:cont
    transformation:
        name:RobustScaler
        args: {}
```

Args:
* feature_dict (dict): definition for feature transformations
* impute (bool): impute values at the end or not
* impute_strategy (str): how to impute values. default is mean

Returns:
* object: sklearn feature union pipeline

In [7]:
# Example: two categorical and two continuous features, each with its own transformer.
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2 -- update this entry if the pinned scikit-learn is newer.
feature_dict = {
    'MARKETINGZONE': {'transformation': {'name': 'OneHotEncoder', 'args': {'handle_unknown': 'ignore', 'sparse': False}},
                      'variable_type': 'cat'},
    'ONLYSINGLERESORTKEY': {'transformation': {'name': 'OrdinalEncoder', 'args': {'handle_unknown': 'use_encoded_value', 'unknown_value': -1}},
                            'variable_type': 'cat'},
    'TOTALSEASONSSCANNED': {'transformation': {'name': 'StandardScaler', 'args': {}},
                            'variable_type': 'cont'},
    'MAXSEASONVISITATIONSTREAK': {'transformation': {'name': 'RobustScaler', 'args': {}},
                                  'variable_type': 'cont'},
}
logger.info(f'Feature Transformation Definition\n{json.dumps(feature_dict, indent=4)}')

pipe = generate_sklearn_preprocessing_pipeline(feature_dict, impute=True, impute_strategy='mean')

# Build the example frame in one step (DataFrame.append was removed in pandas 2.0);
# one value per feature, in feature_dict order, with some values missing on purpose.
rows = [
    ['CO', 1, 4, 3],
    [None, 2, 1, 3],
    ['TX', None, 0, 2],
    ['CO', 2, 5, 4],
    ['CO', None, 3, 1],
    ['TX', 1, None, 6],
    ['CO', None, 2, 3],
]
df = pd.DataFrame(rows, columns=list(feature_dict))
display(df)
df_transform = pd.DataFrame(pipe.fit_transform(df))
display(df_transform)

# The pipeline keeps one output row per input row and the final imputer leaves no gaps.
assert len(df_transform) == len(df)
assert not df_transform.isna().any().any()

INFO:__main__:Feature Transformation Definition
{
    "MARKETINGZONE": {
        "transformation": {
            "name": "OneHotEncoder",
            "args": {
                "handle_unknown": "ignore",
                "sparse": false
            }
        },
        "variable_type": "cat"
    },
    "ONLYSINGLERESORTKEY": {
        "transformation": {
            "name": "OrdinalEncoder",
            "args": {
                "handle_unknown": "use_encoded_value",
                "unknown_value": -1
            }
        },
        "variable_type": "cat"
    },
    "TOTALSEASONSSCANNED": {
        "transformation": {
            "name": "StandardScaler",
            "args": {}
        },
        "variable_type": "cont"
    },
    "MAXSEASONVISITATIONSTREAK": {
        "transformation": {
            "name": "RobustScaler",
            "args": {}
        },
        "variable_type": "cont"
    }
}
INFO:__main__:Creating Sklearn Preprocessing Pipeline
INFO:__main__:Feature: MARKETINGZON

Unnamed: 0,MARKETINGZONE,ONLYSINGLERESORTKEY,TOTALSEASONSSCANNED,MAXSEASONVISITATIONSTREAK
0,CO,1.0,4.0,3
1,,2.0,1.0,3
2,TX,,0.0,2
3,CO,2.0,5.0,4
4,CO,,3.0,1
5,TX,1.0,,6
6,CO,,2.0,3


Unnamed: 0,0,1,2,3,4,5
0,1.0,0.0,0.0,0.0,0.87831,0.0
1,0.0,1.0,0.0,1.0,-0.87831,0.0
2,0.0,0.0,1.0,2.0,-1.46385,-1.0
3,1.0,0.0,0.0,1.0,1.46385,1.0
4,1.0,0.0,0.0,2.0,0.29277,-2.0
5,0.0,0.0,1.0,0.0,0.0,3.0
6,1.0,0.0,0.0,2.0,-0.29277,0.0


# Export Notebooks to Modules

In [8]:
#hide
# Run nbdev's notebook-to-script export; the output lists each converted notebook.
from nbdev.export import notebook2script
notebook2script()

Converted 01_azure.ipynb.
Converted 02_utils_dataframes.ipynb.
Converted 02_utils_parseyaml.ipynb.
Converted 02_utils_stfp.ipynb.
Converted 02_utils_traininghelpers.ipynb.
Converted 02_utils_traininghelpers_fastai.ipynb.
Converted 03_dstools_preparedata.ipynb.
Converted 04_snowflake_copyinto.ipynb.
Converted 04_snowflake_copyinto2.ipynb.
Converted 04_snowflake_query.ipynb.
Converted 05_azure_wrappers.ipynb.
Converted 06_modeling_inference.ipynb.
Converted 06_modeling_inference_fastai.ipynb.
Converted 06_modeling_premodel.ipynb.
Converted 06_modeling_preprocessing.ipynb.
Converted 06_modeling_preprocessing_fastai.ipynb.
Converted 06_modeling_training.ipynb.
Converted 06_modeling_training_fastai.ipynb.
Converted 07_Binary_Classification_Fastai_Example_Notebook.ipynb.
Converted index.ipynb.
