In [3]:
#default_exp dstools.preparedata

In [4]:
#hide
from nbdev.showdoc import *

In [5]:
#hide
# imports needed for test
import numpy as np
import pandas as pd

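# Data Preparation

> Helpers that encode categorical columns, fill missing continuous values, normalize continuous columns, and split a DataFrame into features and target.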

In [6]:
#export
import pandas as pd
from pandas.api.types import is_numeric_dtype

In [7]:
# export


class EncodeCats:
    """Encode categorical columns consistently across train and test frames.

    `apply_train` converts the requested columns to ordered categoricals and
    remembers the categories it found; `apply_test` re-applies those
    categories to another frame, so values that were not seen in training
    (or are missing) end up with category code -1.

    How to Use:
    encode = EncodeCats()
    fixmissing = FixMissing()
    normalize = Normalize()
    df, categories = encode.apply_train(df, cat_vars)
    df, na_dict = fixmissing.apply_train(df, cont_vars, cat_vars, add_col=True)
    df, means, stds = normalize.apply_train(df, cont_vars)
    """

    def __init__(self):
        # Column name -> Index of categories learned by `apply_train`.
        self.categories = {}

    def apply_train(self, df, cat_vars):
        """Convert the `cat_vars` columns of `df` to ordered categoricals.

        `df` is modified in place. Returns `(df, categories)` where
        `categories` maps each column name to the categories found in it.
        """
        for n in cat_vars:
            # Assign the column directly: `df.loc[:, n] = ...` writes into the
            # existing column and, on pandas >= 2.0, keeps its original dtype,
            # which makes the `.cat` access below fail.
            df[n] = df[n].astype('category').cat.as_ordered()
            self.categories[n] = df[n].cat.categories
        return df, self.categories

    @staticmethod
    def apply_test(df, cat_vars, categories):
        """Apply the categories learned in training to the `cat_vars` of `df`.

        `df` is modified in place and also returned. Values not present in
        `categories[n]` become missing (category code -1).
        """
        for n in cat_vars:
            df[n] = pd.Categorical(df[n], categories=categories[n], ordered=True)
        return df

In [8]:
# export


class FixMissing:
    """Fill missing values in continuous columns with the training median."""

    @staticmethod
    def apply_train(df, cont_vars, cat_vars, add_col=True):
        """Fill missing values in `cont_vars` with each column's median.

        Only columns that actually contain missing values are touched. When
        `add_col` is true, a boolean `<name>_na` column marking the filled
        rows is added. `cat_vars` is accepted but not used. `df` is modified
        in place.

        Returns `(df, na_dict)` where `na_dict` maps each filled column name
        to the median used.
        """
        na_dict = {}
        for name in cont_vars:
            missing = pd.isnull(df[name])
            if not missing.any():
                continue
            if add_col:
                df[name + '_na'] = missing
            filler = df[name].median()
            df[name] = df[name].fillna(filler)
            na_dict[name] = filler
        return df, na_dict

    @staticmethod
    def apply_test(df, cont_vars, cat_vars, na_dict, add_col=True):
        """Fill missing values in `cont_vars` using the fillers in `na_dict`.

        Columns present in `na_dict` get the same treatment as in
        `apply_train` (including the `<name>_na` column when `add_col` is
        true). `cat_vars` is accepted but not used. `df` is modified in place
        and also returned.

        Raises `ValueError` if a column that has no entry in `na_dict`
        contains missing values, since there is no training filler for it.
        """
        for name in cont_vars:
            missing = pd.isnull(df[name])
            if name in na_dict:
                if add_col:
                    df[name + '_na'] = missing
                df[name] = df[name].fillna(na_dict[name])
            elif missing.any():
                raise ValueError(f"""There are nan values in field {name}.""")
        return df

In [9]:
# export


class Normalize:
    """Standardize continuous columns using statistics from the training frame."""

    @staticmethod
    def apply_train(df, cont_vars):
        """Compute the mean and std of each `cont_vars` column and normalize it.

        `df` is modified in place. Returns `(df, means, stds)`, where `means`
        and `stds` map each column name to the statistic that was used.
        Fails with an `AssertionError` if a column is not numeric.
        """
        means, stds = {}, {}
        for n in cont_vars:
            assert is_numeric_dtype(df[n]), f"""Can't normalize '{n}' column as it isn't numerical."""  # noqa:
            means[n], stds[n] = df[n].mean(), df[n].std()
            # The small constant keeps the division finite when std is 0.
            df[n] = (df[n] - means[n]) / (1e-7 + stds[n])
        return df, means, stds

    @staticmethod
    def apply_test(df, means, stds, cont_vars):
        """Normalize `cont_vars` with the statistics returned by `apply_train`.

        `df` is modified in place and also returned.
        """
        for n in cont_vars:
            df[n] = (df[n] - means[n]) / (1e-7 + stds[n])
        return df

In [10]:
# export


def numericalize(df, col, name, max_n_cat):
    """Replace a categorical column of `df` with its integer codes plus one.

    `col` is the column stored under `name`. Numeric columns are left alone.
    A non-numeric column is encoded when `max_n_cat` is None or when it has
    more than `max_n_cat` categories; otherwise it is left unchanged. Adding
    one to the codes means a missing value (code -1) becomes 0.
    """
    if is_numeric_dtype(col):
        return
    if max_n_cat is not None and len(col.cat.categories) <= max_n_cat:
        # Few enough categories: keep the categorical column as it is.
        return
    df[name] = col.cat.codes + 1

In [11]:
# export


def split_data(df, y_fld=None, max_n_cat=None):
    """Split a prepared frame into model features and target values.

    Works on a copy, so the caller's `df` is not modified.

    When `y_fld` is given, that column is removed from the features and
    returned as an array; a non-numeric target is replaced by its category
    codes first (so it must already be a categorical column). When `y_fld`
    is None, every column is kept as a feature and the returned target is
    None.

    The remaining columns are passed through `numericalize` with `max_n_cat`.

    Returns `(features, y)`.
    """
    df = df.copy()
    y = None
    # Only drop the target when there is one: `df.drop(None, axis=1)` raises,
    # which used to make the default `y_fld=None` call fail.
    if y_fld is not None:
        if not is_numeric_dtype(df[y_fld]):
            df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        df = df.drop(columns=y_fld)
    for n, c in df.items():
        numericalize(df, c, n, max_n_cat)
    return df, y

# How to use

In [12]:
# One instance of each preprocessing step, shared by the example cells.
encode, fixmissing, normalize = EncodeCats(), FixMissing(), Normalize()

In [13]:
# Toy training rows: row 1 has a missing `avg_visits`.
dict1 = [
    dict(ecid=150, home='CA', avg_visits=0.20, LTR=6),
    dict(ecid=151, home='LA', avg_visits=np.nan, LTR=2),
    dict(ecid=160, home='CO', avg_visits=0.56, LTR=4),
    dict(ecid=100, home='LA', avg_visits=2.0, LTR=3),
]

# Toy test rows: row 2 has a missing `avg_visits`, row 3 has a missing `home`.
dict2 = [
    dict(ecid=150, home='CA', avg_visits=0.20, LTR=6),
    dict(ecid=151, home='LA', avg_visits=2.68, LTR=2),
    dict(ecid=160, home='CO', avg_visits=np.nan, LTR=4),
    dict(ecid=100, home=None, avg_visits=2.0, LTR=3),
]

df, test = pd.DataFrame(dict1), pd.DataFrame(dict2)

cat_vars = ['home']
# Every column that is not categorical is treated as continuous.
cont_vars = [c for c in df.columns if c not in cat_vars]

In [14]:
# Render the signature and docstring of EncodeCats.apply_train.
show_doc(EncodeCats.apply_train)

<h4 id="EncodeCats.apply_train" class="doc_header"><code>EncodeCats.apply_train</code><a href="__main__.py#L18" class="source_link" style="float:right">[source]</a></h4>

> <code>EncodeCats.apply_train</code>(**`df`**, **`cat_vars`**)

Transform cat_vars columns in categoricals

In [15]:
# Encode the training frame and keep the learned categories for the test frame.
df, categories = encode.apply_train(df=df, cat_vars=cat_vars)

In [16]:
# Render the signature and docstring of EncodeCats.apply_test.
show_doc(EncodeCats.apply_test)

<h4 id="EncodeCats.apply_test" class="doc_header"><code>EncodeCats.apply_test</code><a href="__main__.py#L25" class="source_link" style="float:right">[source]</a></h4>

> <code>EncodeCats.apply_test</code>(**`df`**, **`cat_vars`**, **`categories`**)

Apply transform of cat_vars from training to test

In [17]:
# Re-use the training categories on the test frame, whose last row has no `home`.
encode.apply_test(df=test, cat_vars=cat_vars, categories=categories)
assert (test['home'].cat.codes.iloc[-1:] == -1).all(), 'encode should make missing -1'

In [18]:
# Render the signature and docstring of FixMissing.apply_train.
show_doc(FixMissing.apply_train)

<h4 id="FixMissing.apply_train" class="doc_header"><code>FixMissing.apply_train</code><a href="__main__.py#L6" class="source_link" style="float:right">[source]</a></h4>

> <code>FixMissing.apply_train</code>(**`df`**, **`cont_vars`**, **`cat_vars`**, **`add_col`**=*`True`*)

Fill missing in cont_vars

In [19]:
# Training row 1 had no `avg_visits`, so it must now hold the recorded filler.
df, na_dict = fixmissing.apply_train(df=df, cont_vars=cont_vars, cat_vars=cat_vars, add_col=True)
assert df.loc[1, 'avg_visits'] == na_dict.get('avg_visits'), 'na_dict value applied to missing value'

In [20]:
# Render the signature and docstring of FixMissing.apply_test.
show_doc(FixMissing.apply_test)

<h4 id="FixMissing.apply_test" class="doc_header"><code>FixMissing.apply_test</code><a href="__main__.py#L19" class="source_link" style="float:right">[source]</a></h4>

> <code>FixMissing.apply_test</code>(**`df`**, **`cont_vars`**, **`cat_vars`**, **`na_dict`**, **`add_col`**=*`True`*)

Fill missing values in cont_vars like apply train

In [21]:
fixmissing.apply_test(test, cont_vars, cat_vars, na_dict, add_col=True)
# Check the *test* frame: its row 2 is the one that had no `avg_visits`.
# Checking `df` here would pass regardless of what `apply_test` did.
assert test.loc[2, 'avg_visits'] == na_dict.get('avg_visits'), 'na_dict value applied to missing value'
assert test.loc[2, 'avg_visits_na'], 'missing indicator marks the filled row'

In [22]:
# Render the signature and docstring of Normalize.apply_train.
show_doc(Normalize.apply_train)

<h4 id="Normalize.apply_train" class="doc_header"><code>Normalize.apply_train</code><a href="__main__.py#L6" class="source_link" style="float:right">[source]</a></h4>

> <code>Normalize.apply_train</code>(**`df`**, **`cont_vars`**)

Computer the means and stds of cont_name columns to normalize them

In [23]:
# Normalize the training frame and keep the statistics for the test frame.
df, means, stds = normalize.apply_train(df=df, cont_vars=cont_vars)

In [24]:
# Render the signature and docstring of Normalize.apply_test.
show_doc(Normalize.apply_test)

<h4 id="Normalize.apply_test" class="doc_header"><code>Normalize.apply_test</code><a href="__main__.py#L16" class="source_link" style="float:right">[source]</a></h4>

> <code>Normalize.apply_test</code>(**`df`**, **`means`**, **`stds`**, **`cont_vars`**)

Normalize cont_vars with the same statistics in apply_train

In [25]:
normalize.apply_test(df=test, means=means, stds=stds, cont_vars=cont_vars)

# `ecid` holds the same raw values in both frames, so normalizing with the
# training statistics must give identical columns.
assert (df['ecid'].values == test['ecid'].values).all()

In [26]:
# Split off `LTR` as the target; the features lose exactly that one column.
df_trn, y_trn = split_data(df, y_fld='LTR')

assert df.shape[1] == (df_trn.shape[1] + 1), 'removes dependent variable'
assert (y_trn == df['LTR'].values).all(), 'spliting dependent variable'

# Export the notebooks to modules

In [27]:
#hide
# Runs nbdev's export over the project's notebooks; it prints one
# "Converted <notebook>.ipynb." line per notebook it processed.
# NOTE(review): presumably this writes the `#export` cells to the module named
# by `#default_exp` — confirm against the nbdev version in use.
from nbdev.export import notebook2script
notebook2script()

Converted 01_azure.ipynb.
Converted 02_utils_dataframes.ipynb.
Converted 02_utils_parseyaml.ipynb.
Converted 02_utils_stfp.ipynb.
Converted 02_utils_traininghelpers.ipynb.
Converted 02_utils_traininghelpers_fastai.ipynb.
Converted 03_dstools_preparedata.ipynb.
Converted 04_snowflake_copyinto.ipynb.
Converted 04_snowflake_copyinto2.ipynb.
Converted 04_snowflake_query.ipynb.
Converted 05_azure_wrappers.ipynb.
Converted 06_modeling_inference.ipynb.
Converted 06_modeling_inference_fastai.ipynb.
Converted 06_modeling_premodel.ipynb.
Converted 06_modeling_preprocessing.ipynb.
Converted 06_modeling_preprocessing_fastai.ipynb.
Converted 06_modeling_training.ipynb.
Converted 06_modeling_training_fastai.ipynb.
Converted 07_Binary_Classification_Fastai_Example_Notebook.ipynb.
Converted 08_yaml_ingestion_binary_classification.ipynb.
Converted index.ipynb.
