In [1]:
import numpy as np
import pandas as pd              
import category_encoders as ce   
from sklearn.preprocessing import LabelEncoder
# round to two decimal places in python pandas 
pd.options.display.float_format = '{:.2f}'.format

In [17]:
# make some data
df = pd.DataFrame({
    'color':["a", "c", "a", "a", "b", "b"], 
    'outcome':[1, 2, 0, 0, 0, 1]})


In [18]:
df

Unnamed: 0,color,outcome
0,a,1
1,c,2
2,a,0
3,a,0
4,b,0
5,b,1


In [19]:
# set up X and y
X = df.drop('outcome', axis = 1)
y = df.drop('color', axis = 1)

## Label encoder
is used to encode class levels.returns output values starting with 0.

In [22]:
#label incoder 
le = LabelEncoder()
encoded = le.fit_transform(np.ravel(X)) # warning thrown without np.ravel(Return a contiguous flattened array.)

## df.apply(LabelEncoder().fit_transform)  # applying label endcoder to entire dataset

In [23]:
encoded

array([0, 2, 0, 0, 1, 1])

## Ordinal encoder
The first unique value in your column becomes 1, the second becomes 2, the third becomes 3, and so on.
pars: 

1.cols: list of columns to encode, if None, all string columns will be encoded.

3.mapping:a mapping of class to label to use for the encoding. should be a list of dicts where internal dicts should contain the keys 'col' and 'mapping' and in that the 'mapping' key should have a list of tuples of format (original_label, encoded_label) as value.add one more dict to add new column.

2.X, y in fit_transform: Fit encoder according to X and y

4.return_df:boolean for whether to return a pandas DataFrame from transform otherwise it will be numpy array

In [24]:
ce_ord = ce.OrdinalEncoder(cols = ['color'])
ce_ord.fit_transform(X, y['outcome'])  # x for training or fit and y to transform or encode

Unnamed: 0,color
0,1
1,2
2,1
3,1
4,3
5,3


In [25]:
## ex with mapping parameter 
ordinal_cols_mapping = [{
    "col":"ExterQual",    
    "mapping": [
        ('Ex',5), 
        ('Gd',4), 
        ('TA',3), 
        ('Fa',2), 
        ('Po',1), 
        ('NA',np.nan)
    ]}, ]

encoder = OrdinalEncoder(mapping = ordinal_cols_mapping, 
                         return_df = True)  
df_train = encoder.fit_transform(train_data)

NameError: name 'OrdinalEncoder' is not defined

## One-Hot encoder
this feature expansion can create serious memory problems if your data set has high cardinality features. One-hot-encoded data can also be difficult for decision-tree-based algorithms.

In [31]:
ce_one_hot = ce.OneHotEncoder(cols = ['color'])
ce_one_hot.fit_transform(X, y)

Unnamed: 0,color_1,color_2,color_3
0,1,0,0
1,0,1,0
2,1,0,0
3,1,0,0
4,0,0,1
5,0,0,1


## Binary encoder
Binary creates fewer features than one-hot, while preserving some uniqueness of values in the the column. It can work well with higher dimensionality ordinal data.performs well when the cardinality of the column is higher.

In [30]:
ce_bin = ce.BinaryEncoder(cols = ['color'])
ce_bin.fit_transform(X,y)

Unnamed: 0,color_0,color_1,color_2
0,0,0,1
1,0,1,0
2,0,0,1
3,0,0,1
4,0,1,1
5,0,1,1


# BaseN encoder
Base-N encoder encodes the categories into arrays of their base-N representation. A base of 1 is equivalent to one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding.You could use BaseN with gridsearchCV.
pars:
1.base=2(default)


In [32]:
ce_basen = ce.BaseNEncoder(cols = ['color'])
ce_basen.fit_transform(X, y)

Unnamed: 0,color_0,color_1,color_2
0,0,0,1
1,0,1,0
2,0,0,1
3,0,0,1
4,0,1,1
5,0,1,1


## Hashing Encoder
if you have a category with high cardinality you decide a minimum number of reduced categories (hashes) that all the categories will have to share. if two categories share the same hash or bucket, that is called a hash collision. 
HashingEncoder implements the hashing trick. It is similar to one-hot encoding but with fewer new dimensions and some info loss due to collisions.The n_components parameter controls the number of expanded columns. The default is eight columns.You can pass a hashing algorithm of your choice to HashingEncoder; the default is md5. Itâ€™s worth trying HashingEncoder for nominal and ordinal data if you have high cardinality features. 

In [33]:
ce_hash = ce.HashingEncoder(cols = ['color'])
ce_hash.fit_transform(X, y)

Unnamed: 0,color
0,a
1,c
2,a
3,a
4,b
5,b


## Bayesian Encoders
The Bayesian encoders use information from the dependent variable in their encodings. They output one column and can work well with high cardinality data.

## 1.Target encoder
uses the mean of the DV, we must take steps to avoid overfitting. Nominal, ordinal. For classification tasks.

In [36]:
# Target encoder with default parameters
ce_target = ce.TargetEncoder(cols = ['color'])

ce_target.fit(X, y['outcome'])
# Must pass the series for y in v1.2.8

ce_target.transform(X, y['outcome'])

Unnamed: 0,color
0,0.373068
1,0.666667
2,0.373068
3,0.373068
4,0.544824
5,0.544824


## 2.LeaveOneOut encoder
similar to target but avoids contamination. Nominal, ordinal. For classification tasks

In [37]:
ce_leave = ce.LeaveOneOutEncoder(cols = ['color'])
ce_leave.fit(X, y['outcome'])        
ce_leave.transform(X, y['outcome'])    

Unnamed: 0,color
0,0.0
1,0.666667
2,0.5
3,0.5
4,1.0
5,0.0
