In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from custom_ml_toolkit.preprocessor.encoder import SupportMissingOneHotEncoder, SupportMissingCategoricalEncoder, SupportMissingDatasetEncoder

## SupportMissingOneHotEncoder

In [2]:
# Demo of SupportMissingOneHotEncoder on a two-column categorical frame.
# Column 'a' carries one missing value (np.nan); column 'b' is complete.
separator = '-----------------------------------------------------'

print(separator)
data_df = pd.DataFrame({
    'a': ['cat', 'bird', 'dog', np.nan],
    'b': ['red', 'green', 'blue', 'red'],
})
print(data_df)

# --- Standalone usage: fit, list output columns, encode, then decode again ---
print(separator)
smohe = SupportMissingOneHotEncoder(drop_binary=True)
smohe.fit(data_df)
print(smohe.get_feature_names_out())
encoded_data_df = smohe.transform(data_df)
print(encoded_data_df)
print(smohe.inverse_transform(encoded_data_df))

# --- Same steps through an sklearn Pipeline, using a fresh encoder instance ---
print(separator)
smohe = SupportMissingOneHotEncoder(drop_binary=True)
pipe = Pipeline(steps=[('SupportMissingOneHotEncoder', smohe)])
pipe.fit(data_df)
encoded_data_df = pipe.transform(data_df)
print(encoded_data_df)
print(pipe.inverse_transform(encoded_data_df))

-----------------------------------------------------
      a      b
0   cat    red
1  bird  green
2   dog   blue
3   NaN    red
-----------------------------------------------------
['a_bird', 'a_cat', 'a_dog', 'b_blue', 'b_green', 'b_red']
   a_bird  a_cat  a_dog  b_blue  b_green  b_red
0     0.0    1.0    0.0     0.0      0.0    1.0
1     1.0    0.0    0.0     0.0      1.0    0.0
2     0.0    0.0    1.0     1.0      0.0    0.0
3     NaN    NaN    NaN     0.0      0.0    1.0
      a      b
0   cat    red
1  bird  green
2   dog   blue
3  None    red
-----------------------------------------------------
   a_bird  a_cat  a_dog  b_blue  b_green  b_red
0     0.0    1.0    0.0     0.0      0.0    1.0
1     1.0    0.0    0.0     0.0      1.0    0.0
2     0.0    0.0    1.0     1.0      0.0    0.0
3     NaN    NaN    NaN     0.0      0.0    1.0
      a      b
0   cat    red
1  bird  green
2   dog   blue
3  None    red


## SupportMissingCategoricalEncoder

In [3]:
# Demo of SupportMissingCategoricalEncoder on a frame that mixes numerical,
# nominal and ordinal columns; 'num_2', 'nor_2' and 'ord_1' each hold one NaN.
separator = '-----------------------------------------------------'


def _make_smce():
    """Return a fresh, unfitted SupportMissingCategoricalEncoder for this demo.

    Both the standalone run and the Pipeline run below must use the same
    configuration. Building it in one place keeps them from drifting apart,
    and every call creates new list arguments so the two encoder instances
    share no mutable state.
    """
    return SupportMissingCategoricalEncoder(
        numerical_cols=['num_1', 'num_2'],
        norminal_cols=['nor_1', 'nor_2'],  # keyword spelled as the encoder's API expects
        ordinal_cols=['ord_1', 'ord_2'],
        drop_binary=True,
        oe_unknown_value=-1,
        oe_missing_value=-1,
    )


print(separator)
# NOTE(review): 'yello' and '3_hight' look like typos; they are kept unchanged
# here because the recorded cell output is derived from these exact values.
data_df = pd.DataFrame(data={
    'num_1': [1.0, 2.0, 3.0, 4.0],
    'num_2': [1.0, 2.0, 3.0, np.nan],
    'nor_1': ['red', 'green', 'blue', 'yello'],
    'nor_2': ['cat', 'bird', 'dog', np.nan],
    'ord_1': ['1_low', '2_medium', '3_hight', np.nan],
    'ord_2': ['1_low', '2_medium', '3_hight', '3_hight']
})
print(data_df)

# --- Standalone usage: fit, list output columns, encode, then decode again ---
print(separator)
smce = _make_smce()
smce.fit(data_df)
print(smce.get_feature_names_out())
encoded_data_df = smce.transform(data_df)
print(encoded_data_df)
print(smce.inverse_transform(encoded_data_df))

# --- Same steps through an sklearn Pipeline, using a fresh encoder instance ---
print(separator)
smce = _make_smce()
# The step is named after the estimator it actually wraps, so that
# pipe.named_steps and get_params() keys are not mislabelled as the one-hot encoder.
pipe = Pipeline([('SupportMissingCategoricalEncoder', smce)])
pipe.fit(data_df)
encoded_data_df = pipe.transform(data_df)
print(encoded_data_df)
print(pipe.inverse_transform(encoded_data_df))

-----------------------------------------------------
   num_1  num_2  nor_1 nor_2     ord_1     ord_2
0    1.0    1.0    red   cat     1_low     1_low
1    2.0    2.0  green  bird  2_medium  2_medium
2    3.0    3.0   blue   dog   3_hight   3_hight
3    4.0    NaN  yello   NaN       NaN   3_hight
-----------------------------------------------------
['num_1', 'num_2', 'nor_1_blue', 'nor_1_green', 'nor_1_red', 'nor_1_yello', 'nor_2_bird', 'nor_2_cat', 'nor_2_dog', 'ord_1', 'ord_2']
   num_1  num_2  nor_1_blue  nor_1_green  nor_1_red  nor_1_yello  nor_2_bird  \
0    1.0    1.0         0.0          0.0        1.0          0.0         0.0   
1    2.0    2.0         0.0          1.0        0.0          0.0         1.0   
2    3.0    3.0         1.0          0.0        0.0          0.0         0.0   
3    4.0    NaN         0.0          0.0        0.0          1.0         NaN   

   nor_2_cat  nor_2_dog  ord_1  ord_2  
0        1.0        0.0    0.0    0.0  
1        0.0        0.0    1.0  

## SupportMissingDatasetEncoder

In [4]:
# Demo of SupportMissingDatasetEncoder on a frame with numerical, nominal,
# ordinal and target ('y') columns; 'num_2', 'nor_2' and 'ord_1' each hold one NaN.
separator = '-----------------------------------------------------'

print(separator)
data_df = pd.DataFrame({
    'num_1': [1.0, 2.0, 3.0, 4.0],
    'num_2': [1.0, 2.0, 3.0, np.nan],
    'nor_1': ['red', 'green', 'blue', 'yellow'],
    'nor_2': ['cat', 'bird', 'dog', np.nan],
    'ord_1': ['1_low', '2_medium', '3_hight', np.nan],
    'ord_2': ['1_low', '2_medium', '3_hight', '3_hight'],
    'y': ['y', 'n', 'y', 'n'],
})
print(data_df)
print(separator)

# Column roles are listed first, followed by the encoding options.
smde = SupportMissingDatasetEncoder(
    target_col='y',
    numerical_cols=['num_1', 'num_2'],
    norminal_cols=['nor_1', 'nor_2'],  # keyword spelled as the encoder's API expects
    ordinal_cols=['ord_1', 'ord_2'],
    drop_binary=True,
    encode_target=True,
    oe_unknown_value=-1,
    oe_missing_value=-1,
)

smde.fit(data_df)
# Left as a bare final expression so the notebook renders the encoded frame.
smde.transform(data_df)


-----------------------------------------------------
   num_1  num_2   nor_1 nor_2     ord_1     ord_2  y
0    1.0    1.0     red   cat     1_low     1_low  y
1    2.0    2.0   green  bird  2_medium  2_medium  n
2    3.0    3.0    blue   dog   3_hight   3_hight  y
3    4.0    NaN  yellow   NaN       NaN   3_hight  n
-----------------------------------------------------


Unnamed: 0,num_1,num_2,nor_1_blue,nor_1_green,nor_1_red,nor_1_yellow,nor_2_bird,nor_2_cat,nor_2_dog,ord_1,ord_2,y
0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1
1,2.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0
2,3.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,1
3,4.0,,0.0,0.0,0.0,1.0,,,,-1.0,2.0,0
