In [None]:
# https://practicaldatascience.co.uk/machine-learning/how-to-use-category-encoders-to-transform-categorical-variables

In [1]:
project_name = '5. Used Car'
base_dir = 'D:/Projects/Prediction/Techniques Practice/Trees/XGBoost'
temp_dir = base_dir + '/' + project_name + '/' + 'temp data' + '/'
temp_dir

'D:/Projects/Prediction/Techniques Practice/Trees/XGBoost/5. Used Car/temp data/'

In [2]:
import pandas as pd
used_car_df = pd.read_csv(temp_dir + 'used_car_df.csv')

In [3]:
used_car_df.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize', 'make'],
      dtype='object')

In [5]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import category_encoders as ce
import time
import warnings
warnings.filterwarnings('ignore')

numeric_features = used_car_df.select_dtypes([np.number]).drop(['price'], axis=1).columns
numeric_features

categorical_features = used_car_df.select_dtypes(exclude=[np.number]).columns
categorical_features

X = used_car_df.drop('price', axis=1)
y = used_car_df['price']

#y = preprocessing.LabelEncoder().fit_transform(y)
y = np.log(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

selected_model = XGBRegressor(tree_method = "gpu_hist",single_precision_histogram=True, gpu_id=0)

encoders = {
    'BackwardDifferenceEncoder': ce.backward_difference.BackwardDifferenceEncoder,
    'BaseNEncoder': ce.basen.BaseNEncoder,
    'BinaryEncoder': ce.binary.BinaryEncoder,
    'CatBoostEncoder': ce.cat_boost.CatBoostEncoder,
    #'HashingEncoder': ce.hashing.HashingEncoder, takes too long
    'HelmertEncoder': ce.helmert.HelmertEncoder,
    'JamesSteinEncoder': ce.james_stein.JamesSteinEncoder,
    'OneHotEncoder': ce.one_hot.OneHotEncoder,
    'LeaveOneOutEncoder': ce.leave_one_out.LeaveOneOutEncoder,
    'MEstimateEncoder': ce.m_estimate.MEstimateEncoder,
    'OrdinalEncoder': ce.ordinal.OrdinalEncoder,
    'PolynomialEncoder': ce.polynomial.PolynomialEncoder,
    'SumEncoder': ce.sum_coding.SumEncoder,
    'TargetEncoder': ce.target_encoder.TargetEncoder,
    # 'WOEEncoder': ce.woe.WOEEncoder target must be binary
}

used_car_df_results = pd.DataFrame(columns=['encoder', 'rmse', 'r2'])


for key in encoders:
    
    time_0 = time.time()
    categorical_transformer = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encoder', encoders[key]())
        ]
    )    

    numeric_transformer = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)
        ]
    )

    pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('regressor', selected_model)
        ]
    )

    model = pipe.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    row = {
        'encoder': key,
        'rmse': np.sqrt(mse(y_test, y_pred)),
        'r2': r2_score(y_test, y_pred),
    }

    used_car_df_results = used_car_df_results.append(row, ignore_index=True)
    print(key, 'time taken:', time.time()-time_0)
    print()

BackwardDifferenceEncoder time taken: 2.8630101680755615

BaseNEncoder time taken: 0.9177756309509277

BinaryEncoder time taken: 0.9544925689697266

CatBoostEncoder time taken: 1.0588061809539795

HelmertEncoder time taken: 3.3029706478118896

JamesSteinEncoder time taken: 1.01509690284729

OneHotEncoder time taken: 2.917640447616577

LeaveOneOutEncoder time taken: 1.0313606262207031

MEstimateEncoder time taken: 0.8729925155639648

OrdinalEncoder time taken: 0.7889516353607178

PolynomialEncoder time taken: 2.4049313068389893

SumEncoder time taken: 2.8621363639831543

TargetEncoder time taken: 1.0850353240966797



In [7]:
used_car_df_results.head(20).sort_values(by='rmse')


Unnamed: 0,encoder,rmse,r2
5,JamesSteinEncoder,0.111592,0.9558
9,OrdinalEncoder,0.111716,0.955702
8,MEstimateEncoder,0.112808,0.954832
12,TargetEncoder,0.113576,0.954215
0,BackwardDifferenceEncoder,0.113933,0.953927
1,BaseNEncoder,0.113944,0.953918
2,BinaryEncoder,0.113944,0.953918
7,LeaveOneOutEncoder,0.115035,0.953031
4,HelmertEncoder,0.122353,0.946865
10,PolynomialEncoder,0.123543,0.945827


In [None]:
# really interesting.  
