In [1]:
#default_exp modeling.preprocessingfastai

In [2]:
#hide
from nbdev.showdoc import *
from sdsde import files
from sdsde.snowflake.query import SnowflakeConnect
from sdsde.wrapper.azurewrapper import blob_puller
from IPython.display import display, HTML

import json

# Preprocessing Functionality

These functions are designed to help with anything in the preprocessing stage of the ML life cycle.

In [3]:
#export
from sdsde.wrapper.azurewrapper import blob_pusher
from fastai.tabular.core import Categorify, FillMissing, Normalize, RandomSplitter, range_of, CategoryBlock, torch
from fastai.tabular.data import TabularDataLoaders
from fastai.tabular.all import distrib_barrier
from fastai.tabular.core import TabularPandas
from pathlib import Path
from fastcore.basics import patch

import warnings
import pickle
import os
import logging
import pandas as pd
import numpy as np


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

##  `generate_fastai_pytorch_dataloader`

For those who don't know, Fastai is a great wrapper that lets anyone use both custom models and standard state-of-the-art models. Here we create a method that sets up a standard tabular model. Tabular data is anything we would typically see in a .csv file: for example, customer information, where we want to predict whether a person will buy a pass based on what is in that customer's profile.

## ``export``

In [4]:
#export
@patch
def export(self: TabularPandas, fname='export.pkl', pickle_protocol=2):
    """
    Patch onto fastai's `TabularPandas` that pickles the preprocessing object
    without its data, so the same preprocessing can later be loaded and
    extrapolated onto a new dataset.

    Args:
    * self (TabularPandas): TabularPandas
    * fname (str, optional): File Name and Path. Defaults to 'export.pkl'.
    * pickle_protocol (int, optional): Defaults to 2.
    """
    # Pickle the result of new_empty() rather than `self`, so the caller's
    # object (and its data) is left untouched.
    empty_to = self.new_empty()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Context manager guarantees the handle is flushed and closed even if
        # pickling raises part-way through.
        with open(Path(fname), 'wb') as f:
            pickle.dump(empty_to, f, protocol=pickle_protocol)

## ``load_pandas``

In [5]:
#export

def load_pandas(fname):
    "Load in a `TabularPandas` object from `fname`"
    distrib_barrier()
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # were produced by a trusted `export` call.
    with open(fname, 'rb') as f:
        # Context manager closes the handle, which the bare open() never did.
        return pickle.load(f)

## ``generate_fastai_pytorch_dataloader``

In [6]:
#export
def generate_fastai_pytorch_dataloader(df: pd.DataFrame,
                                       cat_vars: list, cont_vars: list, y_var: list,
                                       y_block=CategoryBlock(), y_range: float = None,
                                       bs: int = 254, val_pct: float = 0.2, seed=None,
                                       impute: bool = True, splits: list = None,
                                       procs: list = [Categorify, FillMissing, Normalize]):
    """
    Build a fastai `TabularDataLoaders` and a matching `TabularPandas` from one data frame.

    Both objects are created from the same frame, procs, column lists and split.
    The `TabularPandas` is returned alongside the dataloaders so the preprocessing
    can be exported for a model outside of the Fastai ecosystem.

    See notebook for more information on this process.

    Args:
    * df (pd.DataFrame): Source data; passed to both `TabularPandas` and `TabularDataLoaders.from_df`.
    * cat_vars (list): Categorical column names (passed as `cat_names`).
    * cont_vars (list): Continuous column names (passed as `cont_names`).
    * y_var (list): Dependent variable column name(s) (passed as `y_names`).
    * y_block ([type], optional): Target block handed to fastai. Defaults to CategoryBlock().
    * y_range (float, optional): When given, used as a multiplier on the target's maximum; the
      log of the target minimum and of that scaled maximum become the `y_range` tensor passed
      to the dataloaders, and `np.exp` of it is logged. Defaults to None.
    * bs (int, optional): Batch Size. Defaults to 254.
    * val_pct (float, optional): Validation Size, used only when `splits` is None. Defaults to 0.2.
    * seed ([type], optional): Seed For Split, used only when `splits` is None. Defaults to None.
    * impute (bool, optional): Not referenced in the function body. Defaults to True.
    * splits (list, optional): Pre-computed (train, valid) indices; `splits[1]` is used as the
      validation index. When None a random split is generated. Defaults to None.
    * procs (list, optional): Defaults to most common methods. Defaults to [Categorify, FillMissing, Normalize].

    Returns:
    * Fastai: dl_train, tab_train
    """
    # No explicit split supplied: draw a random train/validation split over the row positions.
    # The split sizes are only logged on this path.
    if splits is None:
        splits = RandomSplitter(valid_pct=val_pct, seed=seed)(range_of(df))
        logger.info(f'Training Data Size {len(splits[0])}')
        logger.info(f'Validation Data Size {len(splits[1])}')
    logger.info(f'Categorical Variable(s) For Project {cat_vars}')
    logger.info(f'Continuous Variable(s) For Project {cont_vars}')
    logger.info(f'Dependent Variable(s) For Project {y_var}')
    logger.info('dataloader being created')
    # y_range arrives as a scalar multiplier and is rebound to a 2-element tensor of
    # log-space bounds: [log(min(y)), log(max(y * y_range))].
    # NOTE(review): df[y_var] with a list selects a DataFrame, so np.max/np.min may reduce
    # per column or to a scalar depending on the pandas version — confirm with the pinned version.
    if y_range is not None:
        max_log_y = np.log(np.max(df[y_var]*y_range))
        min_log_y = np.log(np.min(df[y_var]))
        y_range = torch.tensor([min_log_y, max_log_y], device=None)
        logger.info(f'Model Prediction Range {np.exp(y_range)}')

    # Stand-alone preprocessing object; this is what gets exported without the data.
    tab_train = TabularPandas(df, procs=procs, cat_names=cat_vars,
                              cont_names=cont_vars,
                              y_names=y_var, y_block=y_block,
                              splits=splits)

    # Dataloaders are built from the raw frame again, reusing the same validation indices.
    dl_train = (TabularDataLoaders.from_df(df, procs=procs, y_range=y_range,
                                           cat_names=cat_vars, cont_names=cont_vars,
                                           y_names=y_var, y_block=y_block,
                                           valid_idx=splits[1], bs=bs))
    # Log the first processed training rows as a sanity check.
    logger.info(dl_train.train.xs.head())
    return dl_train, tab_train

In [7]:
# Demo on an all-continuous dataset: sklearn's breast cancer data, with 'target' as the label.
from sklearn import datasets

load_breast_cancer = datasets.load_breast_cancer()
df = pd.DataFrame(load_breast_cancer['data'], columns=load_breast_cancer['feature_names'])
df['target'] = load_breast_cancer['target']

# No categorical columns here, so Categorify is left out of procs.
dl_train, tab_train = generate_fastai_pytorch_dataloader(df, cat_vars=[], cont_vars=list(load_breast_cancer['feature_names']), 
                                                         y_var=['target'], y_block=CategoryBlock(), y_range=None,
                                                         val_pct=0.4, bs=100, procs = [FillMissing, Normalize],
                                                         seed=123, splits=None)

INFO:__main__:Training Data Size 342
INFO:__main__:Validation Data Size 227
INFO:__main__:Categorical Variable(s) For Project []
INFO:__main__:Continuous Variable(s) For Project ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension']
INFO:__main__:Dependent Variable(s) For Project ['target']
INFO:__main__:dataloader being created
INFO:__main__:   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0     1.137686     -2.123748        1.322857   1.054084   

## ``prepare_test_pre_model``

In [8]:
#export
def prepare_test_pre_model(df: pd.DataFrame, dl: TabularDataLoaders = None, label: bool = False):
    """
    helper function that takes a tabular dataloader and returns a prepared dataloader for a new
    data set

    Args:
    * df (pd.DataFrame): data frame
    * dl (TabularDataLoaders, optional): tabulardataloader used to build the test dataloader.
      Defaults to None for signature compatibility, but a dataloader is required.
    * label (bool, optional): Does the data set have the label of interest. Defaults to False.

    Returns:
    * TabularDataLoader

    Raises:
    * ValueError: if `dl` is None.
    """
    # `dl` defaults to None but is dereferenced immediately; fail with a clear
    # message instead of an opaque AttributeError on NoneType.
    if dl is None:
        raise ValueError('prepare_test_pre_model requires a TabularDataLoaders in `dl`, got None')
    dl_test = dl.test_dl(df, with_label=label)
    # Log the first processed rows as a sanity check.
    logger.info(f'dl test {dl_test.xs.head()}')
    return dl_test

In [9]:
# Render the nbdev documentation entry for the function defined above.
show_doc(generate_fastai_pytorch_dataloader)

<h4 id="generate_fastai_pytorch_dataloader" class="doc_header"><code>generate_fastai_pytorch_dataloader</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>generate_fastai_pytorch_dataloader</code>(**`df`**:`DataFrame`, **`cat_vars`**:`list`, **`cont_vars`**:`list`, **`y_var`**:`list`, **`y_block`**=*`<fastai.data.block.TransformBlock object at 0x7f8b63be62e0>`*, **`y_range`**:`float`=*`None`*, **`bs`**:`int`=*`254`*, **`val_pct`**:`float`=*`0.2`*, **`seed`**=*`None`*, **`impute`**:`bool`=*`True`*, **`splits`**:`list`=*`None`*, **`procs`**:`list`=*`[<class 'fastai.tabular.core.Categorify'>, <class 'fastai.tabular.core.FillMissing'>, <class 'fastai.data.transforms.Normalize'>]`*)

Active Development with Sklearn Pipeline, but currently when using the fastai dataloader
we are using the DataLoaderAPI as well as the TabularPandas functions. The reason that
both are in here is to allow the user to export the preprocess process for a model outside
of the Fastai ecosystem.

See notebook for more information on this process.

Args:
* df (pd.DataFrame): [description]
* y_block ([type], optional): [description]. Defaults to CategoryBlock().
* y_range (float, optional): This is giving the range of a prediction for the model and is logged automatically and reported back to the user np.exp. Defaults to None.
* bs (int, optional): Batch Size. Defaults to 254.
* val_pct (float, optional): Validation Size. Defaults to 0.2.
* seed ([type], optional): Seed For Split. Defaults to None.
* impute (bool, optional): Sklearn Impute Function. Defaults to True.
* procs (list, optional): Defaults to most common methods. Defaults to [Categorify, FillMissing, Normalize].

Returns:
* Fastai: dl_train, tab_train

In [10]:
# Column name -> role metadata for the toy example. The key order also defines
# the column order of the demo frame built below.
feature_dict = {
    'MAREKTINGZONE': {'variable_type': 'y',
                      'change_dtype': 'no'},
    'ONLYSINGLERESORTKEY': {'variable_type': 'cat',
                            'change_dtype': 'yes'},
    'TOTOALSEASONSSCANNED': {'variable_type': 'cont',
                             'change_dtype': 'no'},
    'MAXSEASONVISITATIONSTREAK': {'variable_type': 'cont',
                                  'change_dtype': 'no'}
}

# Build the frame in one call: DataFrame.append was removed in pandas 2.0, so the
# old "empty frame + append" pattern no longer runs on current pandas.
df = pd.DataFrame(
    [['CO', 1, 4, 3], ['CO', 2, 1, 3], ['TX', None, 0, 2], ['CO', None, 5, 4],
     ['CO', None, 3, 1], ['TX', 1, None, 6], ['CO', None, 2, 3]],
    columns=list(feature_dict),
)
display(df)

Unnamed: 0,MAREKTINGZONE,ONLYSINGLERESORTKEY,TOTOALSEASONSSCANNED,MAXSEASONVISITATIONSTREAK
0,CO,1.0,4.0,3
1,CO,2.0,1.0,3
2,TX,,0.0,2
3,CO,,5.0,4
4,CO,,3.0,1
5,TX,1.0,,6
6,CO,,2.0,3


In [11]:
# Toy run with one categorical and two continuous columns; bs=2 because the frame has only 7 rows.
dl_train, tab_train = generate_fastai_pytorch_dataloader(df, y_block=CategoryBlock(), 
                                                         cat_vars = ['ONLYSINGLERESORTKEY'],
                                                         cont_vars = ['TOTOALSEASONSSCANNED', 'MAXSEASONVISITATIONSTREAK'],
                                                         y_var = ['MAREKTINGZONE'],  y_range=None,
                                                         val_pct=0.4, bs=2, procs = [Categorify, FillMissing, Normalize],
                                                         seed=123)

INFO:__main__:Training Data Size 5
INFO:__main__:Validation Data Size 2
INFO:__main__:Categorical Variable(s) For Project ['ONLYSINGLERESORTKEY']
INFO:__main__:Continuous Variable(s) For Project ['TOTOALSEASONSSCANNED', 'MAXSEASONVISITATIONSTREAK']
INFO:__main__:Dependent Variable(s) For Project ['MAREKTINGZONE']
INFO:__main__:dataloader being created
INFO:__main__:   ONLYSINGLERESORTKEY  TOTOALSEASONSSCANNED_na  TOTOALSEASONSSCANNED  \
1                    2                        1             -1.281423   
3                    0                        1              1.733690   
4                    0                        1              0.226133   
5                    1                        2             -0.150756   
6                    0                        1             -0.527645   

   MAXSEASONVISITATIONSTREAK  
1                  -0.246183  
3                   0.369274  
4                  -1.477098  
5                   1.600189  
6                  -0.246183  


In [12]:
# Labelled test frame with the same rows and column order as the training demo.
# Built in one call because DataFrame.append was removed in pandas 2.0.
df_test = pd.DataFrame(
    [['CO', 1, 4, 3], ['CO', 2, 1, 3], ['TX', None, 0, 2], ['CO', None, 5, 4],
     ['CO', None, 3, 1], ['TX', 1, None, 6], ['CO', None, 2, 3]],
    columns=list(feature_dict.keys()),
)

In [13]:
# Pickle the preprocessing object to disk using the patched `export` defined above.
tab_train.export('transformer.pkl')

In [14]:
# Read the pickled preprocessing object back from disk.
to_load = load_pandas('./transformer.pkl')

In [15]:
# Wrap the labelled test frame with the loaded preprocessing object; process() is called in the next cell.
to_new = to_load.train.new(df_test)

In [16]:
# Run the preprocessing on to_new; the following cell reads the result from to_new.xs.
to_new.process()

In [17]:
# Show the first processed feature rows of the test frame.
to_new.xs.head()

Unnamed: 0,ONLYSINGLERESORTKEY,TOTOALSEASONSSCANNED_na,TOTOALSEASONSSCANNED,MAXSEASONVISITATIONSTREAK
0,1,1,0.979912,-0.246183
1,2,1,-1.281423,-0.246183
2,0,1,-2.035201,-0.86164
3,0,1,1.73369,0.369274
4,0,1,0.226133,-1.477098


One drawback of the Fastai DataLoader method is that you potentially have to bring a very large data frame into memory. I have a question on the forum asking whether we can take the dataset out and save only the preprocessing piece, but I am not sure this will happen; time will tell. So if you don't plan on using a Tabular Learner or a custom Tabular Model, this method shouldn't be used in its current state.

In [18]:
# Inspect the test-frame columns before it is rebuilt in the next cell.
df_test.columns

Index(['MAREKTINGZONE', 'ONLYSINGLERESORTKEY', 'TOTOALSEASONSSCANNED',
       'MAXSEASONVISITATIONSTREAK'],
      dtype='object')

In [19]:
# Unlabelled test frame (no MAREKTINGZONE column) with missing values in the first two columns.
# Built in one call: DataFrame.append was removed in pandas 2.0 and the np.NaN alias
# was removed in NumPy 2.0, so np.nan is used instead.
df_test = pd.DataFrame(
    [[np.nan, np.nan, 20], [10, 1, 3], [np.nan, np.nan, 4]],
    columns=['ONLYSINGLERESORTKEY', 'TOTOALSEASONSSCANNED', 'MAXSEASONVISITATIONSTREAK'],
)

In [20]:
# Repeat the wrap step, this time with the frame that has no label column.
to_new = to_load.train.new(df_test)

In [21]:
# Run the loaded preprocessing on the unlabelled frame.
to_new.process()

In [22]:
# Show the processed feature rows for the unlabelled frame.
to_new.xs.head()

Unnamed: 0,ONLYSINGLERESORTKEY,TOTOALSEASONSSCANNED_na,TOTOALSEASONSSCANNED,MAXSEASONVISITATIONSTREAK
0,0,2,-0.150756,10.216593
1,0,1,-1.281423,-0.246183
2,0,2,-0.150756,0.369274


In [23]:
# Same unlabelled frame, this time going through the dataloader route instead of the pickled preprocessor.
dl_test = prepare_test_pre_model(df=df_test, dl=dl_train, label=False)

INFO:__main__:dl test    ONLYSINGLERESORTKEY  TOTOALSEASONSSCANNED_na  TOTOALSEASONSSCANNED  \
0                    0                        2             -0.150756   
1                    0                        1             -1.281423   
2                    0                        2             -0.150756   

   MAXSEASONVISITATIONSTREAK  
0                  10.216593  
1                  -0.246183  
2                   0.369274  


In [24]:
# Remove the pickle written by the export demo above.
from sdsde.azure.filehandling import unlink_files
unlink_files(['./transformer.pkl'])

### ``save_fastai_preprocess_to_data_lake``

In [25]:
#export
def save_fastai_preprocess_to_data_lake(preprocesser, file_name: str, path: str,
                                        container: str, connection_str: str, overwrite: bool = False):
    """
    push preprocess object to azure datalake

    The object is first exported to a local file named `file_name`, that file is
    handed to `blob_pusher`, and the local file is removed afterwards whether or
    not the upload succeeded.

    Args:
    * preprocesser (object): preprocessor; must provide an `export(file_name)` method
    * file_name (str): filename
    * path (str): path to save file
    * container (str): container name
    * connection_str (str): azure connection string
    * overwrite (bool, optional): overwrite files. Defaults to False.
    """
    logger.info(f'Pushing Fastai Preprocesser Object to Azure: {os.path.join(path, file_name)}')
    try:
        preprocesser.export(file_name)
        blob_pusher(container_name=container,
                    connection_str=connection_str,
                    file_path=[file_name],
                    blob_dest=[path],
                    overwrite=overwrite)
    finally:
        # Clean up the local staging file even when export or upload raises,
        # so a failed push does not leave the pickle behind on disk.
        if os.path.exists(file_name):
            os.unlink(file_name)

### ``save_dataloader_to_data_lake``

In [26]:
#export


def save_dataloader_to_data_lake(dl, file_name: str, path: str,
                                 container: str, connection_str: str, overwrite: bool = False):
    """
    push dataloader object to azure datalake

    The dataloader is first written to a local file named `file_name` with
    `torch.save`, that file is handed to `blob_pusher`, and the local file is
    removed afterwards whether or not the upload succeeded.

    Args:
    * dl (object): dataloader
    * file_name (str): filename
    * path (str): path to save file
    * container (str): container name
    * connection_str (str): azure connection string
    * overwrite (bool, optional): overwrite files. Defaults to False.
    """
    logger.info(f'Pushing DataLoader Object to Azure: {os.path.join(path, file_name)}')
    try:
        torch.save(dl, file_name)
        blob_pusher(container_name=container,
                    connection_str=connection_str,
                    file_path=[file_name],
                    blob_dest=[path],
                    overwrite=overwrite)
    finally:
        # Clean up the local staging file even when saving or upload raises,
        # so a failed push does not leave the file behind on disk.
        if os.path.exists(file_name):
            os.unlink(file_name)

# Create

In [None]:
#hide
# Run nbdev's notebook-to-script conversion; the output below lists the notebooks converted.
from nbdev.export import notebook2script
notebook2script()

Converted 01_azure.ipynb.
Converted 02_utils_dataframes.ipynb.
Converted 02_utils_parseyaml.ipynb.
Converted 02_utils_stfp.ipynb.
Converted 02_utils_traininghelpers.ipynb.
Converted 02_utils_traininghelpers_fastai.ipynb.
Converted 03_dstools_preparedata.ipynb.
Converted 04_snowflake_copyinto.ipynb.
Converted 04_snowflake_copyinto2.ipynb.
Converted 04_snowflake_query.ipynb.
Converted 05_azure_wrappers.ipynb.
