### Dummy Encoding

In [1]:
import numpy as np
import pandas as pd

In [7]:
from sklearn.preprocessing import binarize, OneHotEncoder, label_binarize, LabelEncoder, CategoricalEncoder, LabelBinarizer

In [26]:
df = pd.DataFrame({'car': ['BMW', 'Audi', 'Mercedes', 'Fiat', 'Toyota', 'Fiat', 'Audi']})

In [51]:
df['label'] = [100, 200, 150, 50, 100, 60, 300]

In [52]:
df

Unnamed: 0,car,label
0,BMW,100
1,Audi,200
2,Mercedes,150
3,Fiat,50
4,Toyota,100
5,Fiat,60
6,Audi,300


In [8]:
lb = LabelBinarizer()

In [9]:
lb.fit_transform(df)

array([[0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1]])

In [24]:
import category_encoders as ce

## Ordinal Encoder
Transforms each category into a number. For Decison Trees okaysih, but for linear regression models not so good.
Implies an order which may not actually exist.

In [55]:
encoder = ce.OrdinalEncoder()
df.join(encoder.fit_transform(df[['car']]), rsuffix='_encode')

Unnamed: 0,car,label,car_encode
0,BMW,100,0
1,Audi,200,1
2,Mercedes,150,2
3,Fiat,50,3
4,Toyota,100,4
5,Fiat,60,3
6,Audi,300,1


In [56]:
encoder = ce.OneHotEncoder()
df.join(encoder.fit_transform(df[['car']]), rsuffix='_encode')

Unnamed: 0,car,label,car_0,car_1,car_2,car_3,car_4,car_-1
0,BMW,100,1,0,0,0,0,0
1,Audi,200,0,1,0,0,0,0
2,Mercedes,150,0,0,1,0,0,0
3,Fiat,50,0,0,0,1,0,0
4,Toyota,100,0,0,0,0,1,0
5,Fiat,60,0,0,0,1,0,0
6,Audi,300,0,1,0,0,0,0


## Binary Encoder
Transforms each category into a number (ordinal). Afterwards it translates into its binary representation.

In [57]:
encoder = ce.BinaryEncoder()
df.join(encoder.fit_transform(df[['car']]), rsuffix='_encode')

Unnamed: 0,car,label,car_0,car_1,car_2
0,BMW,100,0,0,0
1,Audi,200,0,0,1
2,Mercedes,150,0,1,0
3,Fiat,50,0,1,1
4,Toyota,100,1,0,0
5,Fiat,60,0,1,1
6,Audi,300,0,0,1


In [41]:
print(bin(3)[2:])
print(bin(5)[2:])

11
101


Binary: first the categories are encoded as ordinal, then those integers are converted into binary code, then the digits from that binary string are split into separate columns.  This encodes the data in fewer dimensions that one-hot, but with some distortion of the distances.

Using binary numbers instead of one-hot vectors introduces dependencies between the different classes.

OneHot: N -> N cols  
Binary: N -> ceiling($log_2(N))$

$2^n = zahl$

In [46]:
import math

In [49]:
print(bin(100)[2:])

1100100


In [48]:
math.ceil(math.log(100,2))

7

## BackwardDifferenceEncoder
In this coding system, the mean of the dependent variable for 
one level of the categorical  variable is compared to the  mean 
of the dependent variable for the prior adjacent leve
l. 

Backward Difference: the mean of the dependent variable for a level is compared with the mean of the dependent variable for the prior level. This type of coding may be useful for a nominal or an ordinal variable.

In [59]:
encoder = ce.BackwardDifferenceEncoder()
df.join(encoder.fit_transform(df[['car']], df.label), rsuffix='_encode')

Unnamed: 0,car,label,col_car_0,col_car_1,col_car_2,col_car_3,col_car_4
0,BMW,100,1.0,-0.8,-0.6,-0.4,-0.2
1,Audi,200,1.0,0.2,-0.6,-0.4,-0.2
2,Mercedes,150,1.0,0.2,0.4,-0.4,-0.2
3,Fiat,50,1.0,0.2,0.4,0.6,-0.2
4,Toyota,100,1.0,0.2,0.4,0.6,0.8
5,Fiat,60,1.0,0.2,0.4,0.6,-0.2
6,Audi,300,1.0,0.2,-0.6,-0.4,-0.2


Unnamed: 0,car_0,car_1,car_2,car_3,car_4,car_-1
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,1,0,0,0
3,0,0,0,1,0,0
4,1,0,0,0,0,0
5,0,0,0,0,1,0


In [61]:
encoder = ce.HashingEncoder()
df.join(encoder.fit_transform(df[['car']], df.label), rsuffix='_encode')

Unnamed: 0,car,label,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,BMW,100,1,0,0,0,0,0,0,0
1,Audi,200,0,1,0,0,0,0,0,0
2,Mercedes,150,0,0,0,1,0,0,0,0
3,Fiat,50,0,0,0,0,0,1,0,0
4,Toyota,100,0,1,0,0,0,0,0,0
5,Fiat,60,0,0,0,0,0,1,0,0
6,Audi,300,0,1,0,0,0,0,0,0


In [62]:
encoder = ce.HelmertEncoder()
df.join(encoder.fit_transform(df[['car']], df.label), rsuffix='_encode')

Unnamed: 0,car,label,col_car_0,col_car_1,col_car_2,col_car_3,col_car_4
0,BMW,100,1.0,-1.0,-1.0,-1.0,-1.0
1,Audi,200,1.0,1.0,-1.0,-1.0,-1.0
2,Mercedes,150,1.0,0.0,2.0,-1.0,-1.0
3,Fiat,50,1.0,0.0,0.0,3.0,-1.0
4,Toyota,100,1.0,0.0,0.0,0.0,4.0
5,Fiat,60,1.0,0.0,0.0,3.0,-1.0
6,Audi,300,1.0,1.0,-1.0,-1.0,-1.0


Unnamed: 0,car
0,0
1,1
2,2
3,3
4,0
5,4


In [63]:
encoder = ce.SumEncoder()
df.join(encoder.fit_transform(df[['car']], df.label), rsuffix='_encode')

Unnamed: 0,car,label,col_car_0,col_car_1,col_car_2,col_car_3,col_car_4
0,BMW,100,1.0,1.0,0.0,0.0,0.0
1,Audi,200,1.0,0.0,1.0,0.0,0.0
2,Mercedes,150,1.0,0.0,0.0,1.0,0.0
3,Fiat,50,1.0,0.0,0.0,0.0,1.0
4,Toyota,100,1.0,-1.0,-1.0,-1.0,-1.0
5,Fiat,60,1.0,0.0,0.0,0.0,1.0
6,Audi,300,1.0,0.0,1.0,0.0,0.0


In [68]:
from patsy.contrasts import Diff
from patsy.DesignMatrix import 

In [67]:
Diff().code_without_intercept(df['car'])

ContrastMatrix(array([[-0.85714286, -0.71428571, -0.57142857, -0.42857143, -0.28571429,
                       -0.14285714],
                      [ 0.14285714, -0.71428571, -0.57142857, -0.42857143, -0.28571429,
                       -0.14285714],
                      [ 0.14285714,  0.28571429, -0.57142857, -0.42857143, -0.28571429,
                       -0.14285714],
                      [ 0.14285714,  0.28571429,  0.42857143, -0.42857143, -0.28571429,
                       -0.14285714],
                      [ 0.14285714,  0.28571429,  0.42857143,  0.57142857, -0.28571429,
                       -0.14285714],
                      [ 0.14285714,  0.28571429,  0.42857143,  0.57142857,  0.71428571,
                       -0.14285714],
                      [ 0.14285714,  0.28571429,  0.42857143,  0.57142857,  0.71428571,
                        0.85714286]]),
               ['[D.BMW]',
                '[D.Audi]',
                '[D.Mercedes]',
                '[D.Fiat]',
        

In [69]:
from patsy import dmatrices, dmatrix, demo_data

In [75]:
y, X = dmatrices("label ~ car", data=df, return_type='dataframe')

In [77]:
X.join(y)

Unnamed: 0,Intercept,car[T.BMW],car[T.Fiat],car[T.Mercedes],car[T.Toyota],label
0,1.0,1.0,0.0,0.0,0.0,100.0
1,1.0,0.0,0.0,0.0,0.0,200.0
2,1.0,0.0,0.0,1.0,0.0,150.0
3,1.0,0.0,1.0,0.0,0.0,50.0
4,1.0,0.0,0.0,0.0,1.0,100.0
5,1.0,0.0,1.0,0.0,0.0,60.0
6,1.0,0.0,0.0,0.0,0.0,300.0


In [87]:
y, X = dmatrices("label ~ C(car, Diff)-1", data=df, return_type='dataframe')
X.join(y).join(df, lsuffix='_')

Unnamed: 0,"C(car, Diff)[D.Audi]","C(car, Diff)[D.BMW]","C(car, Diff)[D.Fiat]","C(car, Diff)[D.Mercedes]","C(car, Diff)[D.Toyota]",label_,car,label
0,1.0,0.2,-0.6,-0.4,-0.2,100.0,BMW,100
1,1.0,-0.8,-0.6,-0.4,-0.2,200.0,Audi,200
2,1.0,0.2,0.4,0.6,-0.2,150.0,Mercedes,150
3,1.0,0.2,0.4,-0.4,-0.2,50.0,Fiat,50
4,1.0,0.2,0.4,0.6,0.8,100.0,Toyota,100
5,1.0,0.2,0.4,-0.4,-0.2,60.0,Fiat,60
6,1.0,-0.8,-0.6,-0.4,-0.2,300.0,Audi,300


In [81]:
df.groupby('car').mean()

Unnamed: 0_level_0,label
car,Unnamed: 1_level_1
Audi,250
BMW,100
Fiat,55
Mercedes,150
Toyota,100


In [88]:
0.2 * 250 + 100 - 0.6 * 55 - 0.4 * 150  - 0.2 * 100

37.0