# Comparison of Different Categorical Encoding Methods

In [141]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, hamming_loss
import category_encoders as ce
from patsy import dmatrices, dmatrix, demo_data
from pandas_pipeline_classes import feature_engineering as fe, helper, utils, preprocessing as ppc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

### Read Data

In [12]:
df = pd.read_csv('../Solutions/data/car.data', header=None)
df.columns = ['buying','maint','doors','persons','lug_boot','safety','class']
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
class       1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [17]:
pd.Series(df.values.ravel()).unique()

array(['vhigh', '2', 'small', 'low', 'unacc', 'med', 'high', 'big', '4',
       'more', '3', '5more', 'acc', 'vgood', 'good'], dtype=object)

### Encoding Function

In [26]:
encoder_dict = {'BD_E': ce.BackwardDifferenceEncoder(),
 'B_E': ce.BinaryEncoder(),
 'H_E': ce.HashingEncoder(),
 'HEL_E': ce.HelmertEncoder(),
 'OH_E': ce.OneHotEncoder(),
 'O_E': ce.OrdinalEncoder(),
 'S_E': ce.SumEncoder(),
 'P_E': ce.PolynomialEncoder(),
 'BN_E': ce.BaseNEncoder(),
 'T_E': ce.TargetEncoder(),
 'L_E': ce.LeaveOneOutEncoder()}

### Split Train and Test

In [28]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['class'], axis=1), df['class'], test_size=0.2, random_state=42)

In [32]:
le = LabelEncoder()

In [34]:
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

### Pipeline

In [143]:
class EncoderComparison():
    
    def __init__(self,X_train, y_train, X_test, y_test,
                 classifier, encoder, scale=True):
        self.clf = classifier
        self.ec = encoder
        self.scale = scale
        
        self.X_train = X_train
        self.y_train = y_train
        
        self.X_test = X_test
        self.y_test = y_test
        
        self.pipe = None
        self.encoder = None
        self.label_encoder = LabelEncoder()
        
        self.df_transformed = None
        self.scores = {}
        
        self.clf_dict = {
            'rf': RandomForestClassifier(),
            'knn': KNeighborsClassifier(),
            'tree': DecisionTreeClassifier(),
            'svc': SVC(),
            'lr': LogisticRegression()
        }
        
        self.encoder_dict = {
            'BD_E': ce.BackwardDifferenceEncoder(),
            'B_E': ce.BinaryEncoder(),
            'H_E': ce.HashingEncoder(),
            'HEL_E': ce.HelmertEncoder(),
            'OH_E': ce.OneHotEncoder(),
            'O_E': ce.OrdinalEncoder(),
            'S_E': ce.SumEncoder(),
            'P_E': ce.PolynomialEncoder(),
            'BN_E': ce.BaseNEncoder(),
            'T_E': ce.TargetEncoder(),
            'L_E': ce.LeaveOneOutEncoder()
        }
        
    def __create_pipe__(self):
        if self.scale==True:
            self.pipe = Pipeline([
                ('DummeEncode', self.encoder_dict[self.ec]),
                ('Scaler', StandardScaler()),
                ('clf', self.clf_dict[self.clf])
            ])
        else:
            self.pipe = Pipeline([
                ('DummeEncode', self.encoder_dict[self.ec]),
                ('Scaler', StandardScaler()),
                ('clf', self.clf_dict[self.clf])
            ])
    
    def __computeEncodedDF__(self):
        ec_pipe = Pipeline(self.pipe.steps[:-2])
        df_X = pd.DataFrame(ec_pipe.transform(X_train))
        df_y = self.label_encoder.transform(y_train)
        df = df_X.join(y_train)
        df['label_encoded'] = df_y
        return df
    
    def compute_scores(self):
        self.__create_pipe__()
        y_train_encoded = self.label_encoder.fit_transform(y_train)
        y_test_encoded = self.label_encoder.fit_transform(y_test)
        
        self.pipe.fit(X_train, y_train_encoded)
        
        y_pred = self.pipe.predict(X_test)
        
        hamming = hamming_loss(y_test_encoded, y_pred)
        acc = accuracy_score(y_test_encoded, y_pred)
        
        self.scores['hamming'] = hamming
        self.scores['acc'] = acc
        return self.scores

In [121]:
encoder_dict = {'BD_E': ce.BackwardDifferenceEncoder(),
 'B_E': ce.BinaryEncoder(),
 'H_E': ce.HashingEncoder(),
 'HEL_E': ce.HelmertEncoder(),
 'OH_E': ce.OneHotEncoder(),
 'O_E': ce.OrdinalEncoder(),
 'S_E': ce.SumEncoder(),
 'P_E': ce.PolynomialEncoder(),
 'BN_E': ce.BaseNEncoder(),
 'T_E': ce.TargetEncoder(),
# 'L_E': ce.LeaveOneOutEncoder()
               }

In [144]:
clf_dict = {
            'rf': RandomForestClassifier(),
            'knn': KNeighborsClassifier(),
            'tree': DecisionTreeClassifier()
        }

clf_dict = {
            'rf': RandomForestClassifier(),
            'knn': KNeighborsClassifier(),
            'tree': DecisionTreeClassifier(),
            'svc': SVC(),
            'lr': LogisticRegression()
        }

In [145]:
def computeResults(clf, scale=True):
    idx = []
    cols = [clf+'hamming', clf+'acc']
    hamming = []
    acc = []
    for key in encoder_dict:
        idx.append(key)
        encoderComp = EncoderComparison(X_train, y_train, X_test, y_test, clf, key, scale)
        result = encoderComp.compute_scores()
        hamming.append(result['hamming'])
        acc.append(result['acc'])
    df_result = pd.DataFrame({clf + '_hamming': hamming, clf + '_acc': acc})
    df_result.index = idx
    return df_result

In [146]:
final_df = pd.DataFrame()
for element in clf_dict:
    df_temp = computeResults(element)
    final_df = pd.concat([final_df, df_temp], axis=1)



In [147]:
final_df

Unnamed: 0,rf_acc,rf_hamming,knn_acc,knn_hamming,tree_acc,tree_hamming,svc_acc,svc_hamming,lr_acc,lr_hamming
BD_E,0.890173,0.109827,0.913295,0.086705,0.965318,0.034682,0.959538,0.040462,0.872832,0.127168
B_E,0.872832,0.127168,0.806358,0.193642,0.921965,0.078035,0.913295,0.086705,0.783237,0.216763
H_E,0.676301,0.323699,0.65896,0.34104,0.676301,0.323699,0.676301,0.323699,0.679191,0.320809
HEL_E,0.942197,0.057803,0.815029,0.184971,0.953757,0.046243,0.962428,0.037572,0.875723,0.124277
OH_E,0.936416,0.063584,0.83526,0.16474,0.959538,0.040462,0.959538,0.040462,0.875723,0.124277
O_E,0.959538,0.040462,0.919075,0.080925,0.950867,0.049133,0.933526,0.066474,0.745665,0.254335
S_E,0.942197,0.057803,0.930636,0.069364,0.962428,0.037572,0.945087,0.054913,0.872832,0.127168
P_E,0.950867,0.049133,0.809249,0.190751,0.968208,0.031792,0.962428,0.037572,0.875723,0.124277
BN_E,0.820809,0.179191,0.806358,0.193642,0.910405,0.089595,0.913295,0.086705,0.783237,0.216763
T_E,0.910405,0.089595,0.748555,0.251445,0.947977,0.052023,0.751445,0.248555,0.728324,0.271676


In [150]:
final_df.sort_values('rf_hamming')

Unnamed: 0,rf_acc,rf_hamming,knn_acc,knn_hamming,tree_acc,tree_hamming,svc_acc,svc_hamming,lr_acc,lr_hamming
S_E,0.953757,0.046243,0.930636,0.069364,0.962428,0.037572,0.945087,0.054913,0.872832,0.127168
HEL_E,0.933526,0.066474,0.815029,0.184971,0.956647,0.043353,0.962428,0.037572,0.875723,0.124277
BD_E,0.930636,0.069364,0.913295,0.086705,0.959538,0.040462,0.959538,0.040462,0.872832,0.127168
OH_E,0.930636,0.069364,0.83526,0.16474,0.962428,0.037572,0.959538,0.040462,0.875723,0.124277
P_E,0.930636,0.069364,0.809249,0.190751,0.968208,0.031792,0.962428,0.037572,0.875723,0.124277
T_E,0.930636,0.069364,0.748555,0.251445,0.947977,0.052023,0.751445,0.248555,0.728324,0.271676
O_E,0.921965,0.078035,0.919075,0.080925,0.962428,0.037572,0.933526,0.066474,0.745665,0.254335
B_E,0.852601,0.147399,0.806358,0.193642,0.913295,0.086705,0.913295,0.086705,0.783237,0.216763
BN_E,0.843931,0.156069,0.806358,0.193642,0.907514,0.092486,0.913295,0.086705,0.783237,0.216763
H_E,0.676301,0.323699,0.65896,0.34104,0.676301,0.323699,0.676301,0.323699,0.679191,0.320809


In [151]:
final_df.sort_values('knn_hamming')

Unnamed: 0,rf_acc,rf_hamming,knn_acc,knn_hamming,tree_acc,tree_hamming,svc_acc,svc_hamming,lr_acc,lr_hamming
S_E,0.953757,0.046243,0.930636,0.069364,0.962428,0.037572,0.945087,0.054913,0.872832,0.127168
O_E,0.921965,0.078035,0.919075,0.080925,0.962428,0.037572,0.933526,0.066474,0.745665,0.254335
BD_E,0.930636,0.069364,0.913295,0.086705,0.959538,0.040462,0.959538,0.040462,0.872832,0.127168
OH_E,0.930636,0.069364,0.83526,0.16474,0.962428,0.037572,0.959538,0.040462,0.875723,0.124277
HEL_E,0.933526,0.066474,0.815029,0.184971,0.956647,0.043353,0.962428,0.037572,0.875723,0.124277
P_E,0.930636,0.069364,0.809249,0.190751,0.968208,0.031792,0.962428,0.037572,0.875723,0.124277
B_E,0.852601,0.147399,0.806358,0.193642,0.913295,0.086705,0.913295,0.086705,0.783237,0.216763
BN_E,0.843931,0.156069,0.806358,0.193642,0.907514,0.092486,0.913295,0.086705,0.783237,0.216763
T_E,0.930636,0.069364,0.748555,0.251445,0.947977,0.052023,0.751445,0.248555,0.728324,0.271676
H_E,0.676301,0.323699,0.65896,0.34104,0.676301,0.323699,0.676301,0.323699,0.679191,0.320809


In [152]:
final_df.sort_values('tree_hamming')

Unnamed: 0,rf_acc,rf_hamming,knn_acc,knn_hamming,tree_acc,tree_hamming,svc_acc,svc_hamming,lr_acc,lr_hamming
P_E,0.930636,0.069364,0.809249,0.190751,0.968208,0.031792,0.962428,0.037572,0.875723,0.124277
OH_E,0.930636,0.069364,0.83526,0.16474,0.962428,0.037572,0.959538,0.040462,0.875723,0.124277
O_E,0.921965,0.078035,0.919075,0.080925,0.962428,0.037572,0.933526,0.066474,0.745665,0.254335
S_E,0.953757,0.046243,0.930636,0.069364,0.962428,0.037572,0.945087,0.054913,0.872832,0.127168
BD_E,0.930636,0.069364,0.913295,0.086705,0.959538,0.040462,0.959538,0.040462,0.872832,0.127168
HEL_E,0.933526,0.066474,0.815029,0.184971,0.956647,0.043353,0.962428,0.037572,0.875723,0.124277
T_E,0.930636,0.069364,0.748555,0.251445,0.947977,0.052023,0.751445,0.248555,0.728324,0.271676
B_E,0.852601,0.147399,0.806358,0.193642,0.913295,0.086705,0.913295,0.086705,0.783237,0.216763
BN_E,0.843931,0.156069,0.806358,0.193642,0.907514,0.092486,0.913295,0.086705,0.783237,0.216763
H_E,0.676301,0.323699,0.65896,0.34104,0.676301,0.323699,0.676301,0.323699,0.679191,0.320809


In [153]:
final_df.sort_values('svc_hamming')

Unnamed: 0,rf_acc,rf_hamming,knn_acc,knn_hamming,tree_acc,tree_hamming,svc_acc,svc_hamming,lr_acc,lr_hamming
HEL_E,0.933526,0.066474,0.815029,0.184971,0.956647,0.043353,0.962428,0.037572,0.875723,0.124277
P_E,0.930636,0.069364,0.809249,0.190751,0.968208,0.031792,0.962428,0.037572,0.875723,0.124277
BD_E,0.930636,0.069364,0.913295,0.086705,0.959538,0.040462,0.959538,0.040462,0.872832,0.127168
OH_E,0.930636,0.069364,0.83526,0.16474,0.962428,0.037572,0.959538,0.040462,0.875723,0.124277
S_E,0.953757,0.046243,0.930636,0.069364,0.962428,0.037572,0.945087,0.054913,0.872832,0.127168
O_E,0.921965,0.078035,0.919075,0.080925,0.962428,0.037572,0.933526,0.066474,0.745665,0.254335
B_E,0.852601,0.147399,0.806358,0.193642,0.913295,0.086705,0.913295,0.086705,0.783237,0.216763
BN_E,0.843931,0.156069,0.806358,0.193642,0.907514,0.092486,0.913295,0.086705,0.783237,0.216763
T_E,0.930636,0.069364,0.748555,0.251445,0.947977,0.052023,0.751445,0.248555,0.728324,0.271676
H_E,0.676301,0.323699,0.65896,0.34104,0.676301,0.323699,0.676301,0.323699,0.679191,0.320809


In [154]:
final_df.sort_values('lr_hamming')

Unnamed: 0,rf_acc,rf_hamming,knn_acc,knn_hamming,tree_acc,tree_hamming,svc_acc,svc_hamming,lr_acc,lr_hamming
HEL_E,0.933526,0.066474,0.815029,0.184971,0.956647,0.043353,0.962428,0.037572,0.875723,0.124277
OH_E,0.930636,0.069364,0.83526,0.16474,0.962428,0.037572,0.959538,0.040462,0.875723,0.124277
P_E,0.930636,0.069364,0.809249,0.190751,0.968208,0.031792,0.962428,0.037572,0.875723,0.124277
BD_E,0.930636,0.069364,0.913295,0.086705,0.959538,0.040462,0.959538,0.040462,0.872832,0.127168
S_E,0.953757,0.046243,0.930636,0.069364,0.962428,0.037572,0.945087,0.054913,0.872832,0.127168
B_E,0.852601,0.147399,0.806358,0.193642,0.913295,0.086705,0.913295,0.086705,0.783237,0.216763
BN_E,0.843931,0.156069,0.806358,0.193642,0.907514,0.092486,0.913295,0.086705,0.783237,0.216763
O_E,0.921965,0.078035,0.919075,0.080925,0.962428,0.037572,0.933526,0.066474,0.745665,0.254335
T_E,0.930636,0.069364,0.748555,0.251445,0.947977,0.052023,0.751445,0.248555,0.728324,0.271676
H_E,0.676301,0.323699,0.65896,0.34104,0.676301,0.323699,0.676301,0.323699,0.679191,0.320809


In [148]:
final_df = pd.DataFrame()
for element in clf_dict:
    df_temp = computeResults(element, scale=False)
    final_df = pd.concat([final_df, df_temp], axis=1)



In [149]:
final_df

Unnamed: 0,rf_acc,rf_hamming,knn_acc,knn_hamming,tree_acc,tree_hamming,svc_acc,svc_hamming,lr_acc,lr_hamming
BD_E,0.930636,0.069364,0.913295,0.086705,0.959538,0.040462,0.959538,0.040462,0.872832,0.127168
B_E,0.852601,0.147399,0.806358,0.193642,0.913295,0.086705,0.913295,0.086705,0.783237,0.216763
H_E,0.676301,0.323699,0.65896,0.34104,0.676301,0.323699,0.676301,0.323699,0.679191,0.320809
HEL_E,0.933526,0.066474,0.815029,0.184971,0.956647,0.043353,0.962428,0.037572,0.875723,0.124277
OH_E,0.930636,0.069364,0.83526,0.16474,0.962428,0.037572,0.959538,0.040462,0.875723,0.124277
O_E,0.921965,0.078035,0.919075,0.080925,0.962428,0.037572,0.933526,0.066474,0.745665,0.254335
S_E,0.953757,0.046243,0.930636,0.069364,0.962428,0.037572,0.945087,0.054913,0.872832,0.127168
P_E,0.930636,0.069364,0.809249,0.190751,0.968208,0.031792,0.962428,0.037572,0.875723,0.124277
BN_E,0.843931,0.156069,0.806358,0.193642,0.907514,0.092486,0.913295,0.086705,0.783237,0.216763
T_E,0.930636,0.069364,0.748555,0.251445,0.947977,0.052023,0.751445,0.248555,0.728324,0.271676
