In [1]:
import os
import sys

# Put the parent of the working directory (presumably the project root) at the
# front of sys.path so that the `src` package can be imported.
# NOTE: '__file__' is a string literal here, not the `__file__` variable, so
# os.path.dirname() returns '' and the joined path is simply '..' relative to
# the current working directory.
sys.path.insert(0, os.path.abspath(
    os.path.join(os.path.dirname('__file__'), '..')))
from src.encoding.tools import create_encoding_pipeline
# NOTE(review): wildcard import; presumably the source of the `imputers`, `ne`,
# `ce` and `de` names used in the encoder configuration below -- confirm.
from src.encoding.encoders import *
import pandas as pd

# NOTE(review): `set_config` is not referenced in any cell visible here.
from sklearn import set_config

In [2]:
# Toy training frame: two float columns, two categorical columns and one
# datetime column, with the dtypes pinned explicitly in a single chain.
data = (
    pd.DataFrame({
        'num_feature1': [1.0, 2.0, 3.0, 4.0],
        'num_feature2': [10.0, 20.0, 30.0, 40.0],
        'cat_feature1': ['A', 'B', 'A', 'B'],
        'cat_feature2': ['X', 'Y', 'X', 'Y'],
        'date_feature': pd.to_datetime(
            ['2021-01-01', '2021-01-27', '2021-01-03', '2021-03-26']),
    })
    .astype({
        'num_feature1': 'float',
        'num_feature2': 'float',
        'cat_feature1': 'category',
        'cat_feature2': 'category',
    })
    # Already datetime after construction; kept so the dtype is stated explicitly.
    .assign(date_feature=lambda frame: pd.to_datetime(frame['date_feature']))
)

# Binary target, one value per row of `data`.
y = pd.Series([1, 0, 1, 0])

In [3]:
# Single-row hold-out frame with the same columns as the training frame.
# Note that each categorical column only carries the one category present in
# this row ('A' / 'X'), because the cast is done on this frame alone.
test_data = (
    pd.DataFrame({
        'num_feature1': [5.0],
        'num_feature2': [50.0],
        'cat_feature1': ['A'],
        'cat_feature2': ['X'],
        'date_feature': pd.to_datetime(['2021-03-27']),
    })
    .astype({
        'num_feature1': 'float',
        'num_feature2': 'float',
        'cat_feature1': 'category',
        'cat_feature2': 'category',
    })
    # Already datetime after construction; kept so the dtype is stated explicitly.
    .assign(date_feature=lambda frame: pd.to_datetime(frame['date_feature']))
)

In [4]:
# Encoder configuration passed to `create_encoding_pipeline`.
# Layout: dtype key -> treatment name -> {'imputers': [...], 'encoders': [...]}.
encoders_dict = {}

# Numeric columns: mean imputer followed by a standard scaler.
encoders_dict['number'] = {
    'as_number': {
        'imputers': [imputers.SimpleImputer(strategy='mean')],
        'encoders': [ne.StandardScaler()],
    },
}

# Categorical columns: most-frequent imputer followed by a target encoder.
encoders_dict['category'] = {
    'as_category': {
        'imputers': [imputers.SimpleImputer(strategy='most_frequent')],
        'encoders': [ce.TargetEncoder()],
    },
}

# Datetime columns get two treatments; in both, the date feature extractor
# sits in the 'imputers' slot.
encoders_dict['datetime'] = {
    # Extracted date parts handled as numbers and passed to CyclicalFeatures.
    'as_number': {
        'imputers': [de.DateFeatureExtractor()],
        'encoders': [ne.CyclicalFeatures(drop_original=True)],
    },
    # Extracted date parts requested with dtype='category' and target-encoded.
    'as_category': {
        'imputers': [de.DateFeatureExtractor(dtype='category')],
        'encoders': [ce.TargetEncoder()],
    },
}

In [5]:
# Build the encoding pipeline object from the configuration dict; nothing is
# fitted at this point (fitting happens in a later cell).
processor = create_encoding_pipeline(encoders_dict)

Creating encoding pipeline


In [6]:
# Bare last expression: show the notebook's repr of the pipeline before fitting.
processor

In [7]:
# Fit the pipeline on the training frame; the target `y` is passed as well
# (presumably required by the target encoders in the configuration -- confirm).
processor.fit(data, y)
# Show the pipeline again now that it has been fitted.
processor

[FeatureUnion]  (step 1 of 4) Processing columntransformer-1, total=   0.0s
[FeatureUnion]  (step 2 of 4) Processing columntransformer-2, total=   0.0s
[FeatureUnion]  (step 3 of 4) Processing columntransformer-3, total=   0.0s
[FeatureUnion]  (step 4 of 4) Processing columntransformer-4, total=   0.1s


  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.

In [8]:
# Apply the fitted pipeline to the same frame it was fitted on.
encoded = processor.transform(data)
# NOTE(review): print() gives a plain-text dump wrapped across column groups;
# ending the cell with a bare `encoded` would use the notebook's rich display.
print(encoded)

   pipeline__standardscaler__simpleimputer__num_feature1  \
0                                          -1.341641       
1                                          -0.447214       
2                                           0.447214       
3                                           1.341641       

   pipeline__standardscaler__simpleimputer__num_feature2  \
0                                          -1.341641       
1                                          -0.447214       
2                                           0.447214       
3                                           1.341641       

   pipeline__targetencoder__simpleimputer__cat_feature1  \
0                                           0.570926      
1                                           0.429074      
2                                           0.570926      
3                                           0.429074      

   pipeline__targetencoder__simpleimputer__cat_feature2  \
0                                          

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [9]:
# Apply the fitted pipeline to the single-row hold-out frame.
# NOTE(review): this rebinds `encoded`, discarding the training-set result
# produced by the earlier transform cell; a distinct name would keep both.
encoded = processor.transform(test_data)
print(encoded)

   pipeline__standardscaler__simpleimputer__num_feature1  \
0                                           2.236068       

   pipeline__standardscaler__simpleimputer__num_feature2  \
0                                           2.236068       

   pipeline__targetencoder__simpleimputer__cat_feature1  \
0                                           0.570926      

   pipeline__targetencoder__simpleimputer__cat_feature2  \
0                                           0.570926      

   pipeline__cyclicalfeatures__date_feature_month_sin  \
0                                      -2.449294e-16    

   pipeline__cyclicalfeatures__date_feature_month_cos  \
0                                                1.0    

   pipeline__cyclicalfeatures__date_feature_day_sin  \
0                                     -2.449294e-16   

   pipeline__cyclicalfeatures__date_feature_day_cos  \
0                                               1.0   

   pipeline__cyclicalfeatures__date_feature_dayofweek_sin  \
0      

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
