In [1]:
import os
import sys

# Make the project package (`src.*`) importable from this notebook.
#
# The previous expression called os.path.dirname('__file__') on a *string
# literal*, which always evaluates to ''. The resulting path was therefore
# never file-relative: it has always been the parent of the current working
# directory. That is spelled out explicitly here.
# NOTE(review): assumes the kernel's working directory sits one level below
# the project root — confirm for other launch locations.
PROJECT_ROOT = os.path.abspath(os.pardir)

# Only insert when the entry is not already at the front, so re-running this
# cell does not keep stacking duplicate entries onto sys.path.
if sys.path[:1] != [PROJECT_ROOT]:
    sys.path.insert(0, PROJECT_ROOT)
from src.encoders.tools import create_encoding_pipeline
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler
from category_encoders import TargetEncoder, CatBoostEncoder
from feature_engine.creation import CyclicalFeatures
from sklearn import set_config

In [2]:
# Binary target, aligned row-by-row with the training frame below.
y = pd.Series([1, 0, 1, 0])

# Toy training frame: two numeric columns, two categorical columns and one
# datetime column. The dtypes are pinned explicitly in a single `astype` call;
# the date column is already datetime64 because it is built with
# `pd.to_datetime`, so it needs no further conversion.
data = pd.DataFrame(
    {
        'num_feature1': [1.0, 2.0, 3.0, 4.0],
        'num_feature2': [10.0, 20.0, 30.0, 40.0],
        'cat_feature1': ['A', 'B', 'A', 'B'],
        'cat_feature2': ['X', 'Y', 'X', 'Y'],
        'date_feature': pd.to_datetime(
            ['2021-01-01', '2021-01-27', '2021-01-03', '2021-03-26']
        ),
    }
).astype(
    {
        'num_feature1': 'float',
        'num_feature2': 'float',
        'cat_feature1': 'category',
        'cat_feature2': 'category',
    }
)

In [3]:
# Single-row hold-out frame with the same columns as the training frame.
# The dtypes are pinned in one `astype` call; the date column is already
# datetime64 because it is built with `pd.to_datetime`.
# Note: the categorical columns infer their categories from this one row only
# (['A'] and ['X']).
test_data = pd.DataFrame(
    {
        'num_feature1': [5.0],
        'num_feature2': [50.0],
        'cat_feature1': ['A'],
        'cat_feature2': ['X'],
        'date_feature': pd.to_datetime(['2021-03-27']),
    }
).astype(
    {
        'num_feature1': 'float',
        'num_feature2': 'float',
        'cat_feature1': 'category',
        'cat_feature2': 'category',
    }
)

In [4]:
# Encoder specification handed to `create_encoding_pipeline`: one list of
# encoder instances per column kind. The 'datetime' entry is itself a mapping
# with two lists, 'as_number' and 'as_category'.
# NOTE(review): how each key is matched to columns is decided inside
# `create_encoding_pipeline`, which is not shown here — confirm.
#
# Alternatives left disabled by the author:
#   number      -> MinMaxScaler(), RobustScaler()
#   category    -> OneHotEncoder(sparse_output=False), CatBoostEncoder()
#   as_category -> CatBoostEncoder()
encoders_dict = dict(
    number=[StandardScaler()],
    category=[TargetEncoder()],
    datetime=dict(
        as_number=[CyclicalFeatures(drop_original=True)],
        as_category=[TargetEncoder()],
    ),
)

In [5]:
# Build the encoding pipeline from the encoder spec in `encoders_dict`.
# `create_encoding_pipeline` is the project helper imported from
# src.encoders.tools; per the cell output it prints "Creating encoding pipeline".
# The returned object is used below through `.fit(...)` and `.transform(...)`
# (presumably a scikit-learn Pipeline, given the "[Pipeline]" log lines — confirm).
processor = create_encoding_pipeline(encoders_dict)

Creating encoding pipeline


In [6]:
# Bare expression: lets the notebook render the pipeline object before it is fitted.
processor

In [7]:
# Fit the pipeline on the training frame, passing `y` as the target
# (presumably required by the target encoders configured in `encoders_dict` — confirm).
processor.fit(data, y)
# Bare expression: lets the notebook render the pipeline object after fitting.
processor

[Pipeline]  (step 1 of 3) Processing columntransformer-1, total=   0.0s
[Pipeline]  (step 2 of 3) Processing columntransformer-2, total=   0.0s
[Pipeline]  (step 3 of 3) Processing columntransformer-3, total=   0.1s


  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.

In [8]:
# Apply the fitted pipeline to the same frame it was fitted on and print the
# result as plain text.
encoded = processor.transform(data)
print(encoded)

   pipeline__featureunion__targetencoder__remainder__remainder__remainder__date_feature_month_cat  \
0                                           0.525744                                                
1                                           0.525744                                                
2                                           0.525744                                                
3                                           0.434946                                                

   pipeline__featureunion__targetencoder__remainder__remainder__remainder__date_feature_day_cat  \
0                                           0.565054                                              
1                                           0.434946                                              
2                                           0.565054                                              
3                                           0.434946                                              

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [9]:
# Apply the fitted pipeline to the single-row hold-out frame and print the result.
# Note: this rebinds `encoded`, replacing the training-set result from the
# previous cell.
encoded = processor.transform(test_data)
print(encoded)

   pipeline__featureunion__targetencoder__remainder__remainder__remainder__date_feature_month_cat  \
0                                           0.434946                                                

   pipeline__featureunion__targetencoder__remainder__remainder__remainder__date_feature_day_cat  \
0                                           0.434946                                              

   pipeline__featureunion__targetencoder__remainder__remainder__remainder__date_feature_dayofweek_cat  \
0                                                0.5                                                    

   pipeline__featureunion__targetencoder__remainder__remainder__remainder__date_feature_week_cat  \
0                                           0.434946                                               

   pipeline__featureunion__targetencoder__remainder__remainder__remainder__date_feature_dayofYear_cat  \
0                                                0.5                             

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
