In [2]:
import pandas as pd
# Load the e-shop clothing 2008 CSV (comma-separated) into a DataFrame.
clothing_data_df = pd.read_csv(
    './data/e-shop data and description/e-shop clothing 2008.csv',
    sep=',',
)

In [4]:
#imports 
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error as mse
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import category_encoders as ce
import time
import warnings
warnings.filterwarnings('ignore')

BackwardDifferenceEncoder time taken: 4.3300018310546875

BaseNEncoder time taken: 0.8520731925964355

BinaryEncoder time taken: 0.8530011177062988

CatBoostEncoder time taken: 0.9053285121917725

HelmertEncoder time taken: 2.986112356185913

JamesSteinEncoder time taken: 0.7440588474273682

OneHotEncoder time taken: 3.1396644115448

LeaveOneOutEncoder time taken: 1.0416851043701172

MEstimateEncoder time taken: 0.8118071556091309

OrdinalEncoder time taken: 0.839799165725708

PolynomialEncoder time taken: 2.249000072479248

SumEncoder time taken: 2.7720136642456055

TargetEncoder time taken: 0.7807760238647461



In [None]:
# Identify numeric and categorical feature columns.

# Numeric predictors: every numeric column except `price`, which is used as
# the regression target and must not leak into the features.
numeric_features = (
    clothing_data_df
    .select_dtypes(include=[np.number])
    .drop(columns=['price'])
    .columns
)

# Categorical predictors: every column whose dtype is not numeric.
categorical_features = clothing_data_df.select_dtypes(exclude=[np.number]).columns

# Only the last expression of a cell is rendered; a bare name in the middle of
# the cell is silently discarded, so show both indexes together here.
numeric_features, categorical_features

In [None]:
# Prepare the modelling data: features, log-transformed target, train/test split.

# Fixed seed so the split (and therefore every reported metric) is reproducible
# across "Restart Kernel -> Run All".
RANDOM_STATE = 42

X = clothing_data_df.drop('price', axis=1)
y = clothing_data_df['price']

# Model the natural log of the price rather than the raw price.
# NOTE(review): np.log assumes every price is strictly positive - TODO confirm.
y = np.log(y)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [None]:
# Encoders to benchmark, in the order they are evaluated (and indexed in the
# results table).
encoder_names = [
    'BackwardDifferenceEncoder',
    'BaseNEncoder',
    'BinaryEncoder',
    'CatBoostEncoder',
    # 'HashingEncoder' is left out: it takes too long.
    'HelmertEncoder',
    'JamesSteinEncoder',
    'OneHotEncoder',
    'LeaveOneOutEncoder',
    'MEstimateEncoder',
    'OrdinalEncoder',
    'PolynomialEncoder',
    'SumEncoder',
    'TargetEncoder',
    # 'WOEEncoder' is left out: its target must be binary.
]

# Map each encoder name to its class (not an instance); the classes are looked
# up on the category_encoders package under the same name.
encoders = {name: getattr(ce, name) for name in encoder_names}

In [None]:
# Regressor used for every encoder run, trained on GPU 0.
# NOTE(review): `gpu_hist` / `gpu_id` / `single_precision_histogram` look like the
# pre-2.0 XGBoost spelling of the GPU options - confirm against the installed version.
xgb_params = {
    'tree_method': 'gpu_hist',
    'single_precision_histogram': True,
    'gpu_id': 0,
}
selected_model = XGBRegressor(**xgb_params)

# Empty results table: one row per encoder with its test RMSE and R^2.
clothing_data_df_results = pd.DataFrame(columns=['encoder', 'rmse', 'r2'])

In [None]:
# Benchmark every encoder: build a preprocessing + regression pipeline around it,
# fit on the training split and score on the held-out split.

# Numeric preprocessing is the same for every encoder, so define it once.
# ColumnTransformer fits clones of its transformers, so sharing this unfitted
# pipeline between iterations does not carry state over.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]
)

# Collect one record per encoder and build the results frame once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on each call.
records = []

for key, encoder_cls in encoders.items():

    # Categorical columns: fill gaps with the literal 'missing', then encode
    # with a fresh instance of the encoder under test.
    categorical_transformer = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encoder', encoder_cls())
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)
        ]
    )

    # The same `selected_model` instance is refit from scratch on each iteration.
    pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('regressor', selected_model)
        ]
    )

    model = pipe.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Both metrics are computed on the log-price scale used for `y`.
    records.append({
        'encoder': key,
        'rmse': np.sqrt(mse(y_test, y_pred)),
        'r2': r2_score(y_test, y_pred),
    })

clothing_data_df_results = pd.DataFrame(records, columns=['encoder', 'rmse', 'r2'])
    

In [7]:
# Take the first 20 result rows and rank them from lowest to highest RMSE.
(
    clothing_data_df_results
    .iloc[:20]
    .sort_values(by='rmse')
)


Unnamed: 0,encoder,rmse,r2
0,BackwardDifferenceEncoder,0.007809,0.999273
9,OrdinalEncoder,0.008245,0.999189
1,BaseNEncoder,0.014554,0.997474
2,BinaryEncoder,0.014554,0.997474
5,JamesSteinEncoder,0.016508,0.99675
12,TargetEncoder,0.02104,0.994721
8,MEstimateEncoder,0.023136,0.993616
4,HelmertEncoder,0.032937,0.987063
6,OneHotEncoder,0.042753,0.978201
11,SumEncoder,0.042821,0.978133
