In [None]:
# hide
# default_exp blocks.preprocessing
import os
from nbdev.showdoc import *
# When 'settings.ini' is not found in the current working directory, step up
# one directory level (presumably so the notebook runs from the project
# root where that file lives -- TODO confirm).
if not os.path.exists('settings.ini'):
    os.chdir('..')

# Preprocessing components

> Preprocessing components, such as a one-hot encoder.

In [None]:
#export
import pandas as pd
import sklearn.preprocessing
from block_types.core.block_types import PandasComponent
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError

In [None]:
#export
class OneHotEncoder (PandasComponent):
    """One-hot encode a DataFrame and return the result as a DataFrame.

    Thin wrapper around `sklearn.preprocessing.OneHotEncoder` configured to
    produce dense output. The encoded array is wrapped in a `pd.DataFrame`
    that keeps the index of the input and uses the feature names reported by
    the underlying encoder as column names.

    Parameters
    ----------
    categories : 'auto' or list of array-likes, default 'auto'
        Forwarded to the scikit-learn encoder. With 'auto' the component
        must be fitted before it is applied. With explicit categories an
        unfitted encoder is fitted on the first DataFrame it is applied to.
    handle_unknown : str, default 'ignore'
        Forwarded to the scikit-learn encoder.
    dtype : type or str or None, default int
        dtype the encoded array is cast to before building the DataFrame.
        `None` keeps whatever dtype the scikit-learn encoder produced.
    **kwargs
        Forwarded to the `PandasComponent` constructor.
    """
    def __init__ (self,
                  categories='auto',
                  handle_unknown='ignore',
                  dtype=int,
                  **kwargs):
        super().__init__ (**kwargs)
        self.categories = categories
        self.handle_unknown = handle_unknown
        encoder_kwargs = dict (categories=categories,
                               handle_unknown=handle_unknown)
        try:
            # scikit-learn >= 1.2 names the dense-output switch `sparse_output`
            self.one_hot_encoder = sklearn.preprocessing.OneHotEncoder (
                sparse_output=False, **encoder_kwargs)
        except TypeError:
            # older scikit-learn releases only know the `sparse` keyword
            self.one_hot_encoder = sklearn.preprocessing.OneHotEncoder (
                sparse=False, **encoder_kwargs)
        self.dtype = dtype

    def _categories_are_auto (self):
        """Return True when `categories` is the literal string 'auto'."""
        # The isinstance guard avoids an element-wise comparison (and an
        # ambiguous truth value) when categories is a NumPy array.
        return isinstance (self.categories, str) and self.categories == 'auto'

    def _feature_names (self, input_features):
        """Return the output column names reported by the fitted encoder."""
        # `get_feature_names` was replaced by `get_feature_names_out` in
        # newer scikit-learn releases; use whichever one is available.
        if hasattr (self.one_hot_encoder, 'get_feature_names_out'):
            return self.one_hot_encoder.get_feature_names_out (input_features=input_features)
        return self.one_hot_encoder.get_feature_names (input_features=input_features)

    def _fit (self, X, y=None):
        """Fit the wrapped encoder on `X` (`y` is ignored) and return self."""
        self.one_hot_encoder.fit (X)
        return self

    def _apply (self, df):
        """Encode `df` and return a DataFrame with the same index.

        Raises
        ------
        NotFittedError
            If the encoder has not been fitted and `categories` is 'auto'.
        """
        try:
            check_is_fitted (self.one_hot_encoder)
        except NotFittedError:
            if self._categories_are_auto ():
                raise NotFittedError('OneHotEncoder must be fitted first')
            # Categories were given explicitly, so fit on the data at hand
            # instead of requiring a separate fit call.
            self.one_hot_encoder.fit (df)
        X = self.one_hot_encoder.transform (df)
        if self.dtype is not None:
            # Honour the requested dtype (previously only int was applied).
            X = X.astype (self.dtype)
        df = pd.DataFrame (data=X,
                           columns=self._feature_names (df.columns),
                           index=df.index)
        return df

In [None]:
# Toy input: category 'f' of x2 is deliberately absent from the categories
# declared for the encoder below.
df = pd.DataFrame({'x1': list('bbaba'),
                   'x2': list('ecdef')})
# Explicit categories are given, so transform is called without a prior fit.
one_hot_encoder = OneHotEncoder(categories=[['a', 'b'], ['c', 'd', 'e']])
dfr = one_hot_encoder.transform(df)

# show result
print('result:')
display(dfr)

# Reference encoding: pd.get_dummies minus its last column, which
# corresponds to the undeclared category 'f'.
df_dummies = pd.get_dummies(df)
df_dummies = df_dummies.iloc[:, :-1]
assert (dfr == df_dummies).all().all()

In [None]:
import pytest
# test categories='auto'
one_hot_encoder = OneHotEncoder(categories='auto')

# 1.- with categories='auto', calling transform before fit must raise
with pytest.raises(Exception):
    dfr = one_hot_encoder.transform(df)

# 2.- after fitting, 'auto' must give the same encoding as pd.get_dummies
dfr = one_hot_encoder.fit_transform(df)
print('result:')
display(dfr)
expected = pd.get_dummies(df)
assert (dfr == expected).all().all()