## 필요한 모듈 설치

In [1]:
#!pip install category_encoders

In [2]:
import sklearn
import pandas as pd
import numpy as np
import category_encoders as ce
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option("display.max_columns", 30)

## 데이터 전처리

In [3]:
# Load the Allstate training set; the relative path means the CSV must sit in the working directory.
all_data = pd.read_csv('allstate_train.csv')

In [4]:
# Column groups: the label, then (judging by the names) the continuous and the categorical features.
target = ['record_type']
con = [
    'group_size', 'car_age', 'age_oldest',
    'age_youngest', 'duration_previous', 'cost',
]
cat = [
    'day', 'homeowner', 'car_value', 'risk_factor',
    'married_couple', 'C_previous', 'state', 'shopping_pt',
]

# Keep only the listed columns, ordered as con, cat, target.
data = all_data[[*con, *cat, *target]]
data.head()

Unnamed: 0,group_size,car_age,age_oldest,age_youngest,duration_previous,cost,day,homeowner,car_value,risk_factor,married_couple,C_previous,state,shopping_pt,record_type
0,2,2,46,42,2.0,633,0,0,g,3.0,1,1.0,IN,1,0
1,2,2,46,42,2.0,630,0,0,g,3.0,1,1.0,IN,2,0
2,2,2,46,42,2.0,630,0,0,g,3.0,1,1.0,IN,3,0
3,2,2,46,42,2.0,630,0,0,g,3.0,1,1.0,IN,4,0
4,2,2,46,42,2.0,630,0,0,g,3.0,1,1.0,IN,5,0


In [5]:
# Discard every row that has a missing value in at least one of these columns.
data_drop = data.dropna(
    subset=[
        'state',
        'car_value',
        'risk_factor',
        'C_previous',
        'duration_previous',
    ],
)

In [6]:
# Narrow the frame to the two categorical features and the label used by the encoder demos below.
data = data_drop.loc[:, ['car_value', 'state', 'record_type']]
data.head(10)

Unnamed: 0,car_value,state,record_type
0,g,IN,0
1,g,IN,0
2,g,IN,0
3,g,IN,0
4,g,IN,0
5,g,IN,0
6,g,IN,0
7,g,IN,0
8,g,IN,1
9,e,NY,0


## 1. One Hot Encoding

- 각 범주형 변수의 값들을 0과 1로 mapping시킴
- 범주형 값의 종류의 개수만큼 추가적인 feature 생성
- 분류문제 : n개의 카테고리 값이 존재할 때 n개 모두 인코딩
- 회귀문제 : 자유도(degree of freedom)를 반영하기 위해 n-1개의 카테고리 값만 인코딩

<유의할 점>
- 범주형 변수 값들의 종류가 많아지면 feature의 개수가 늘어나 학습속도가 느려짐
- 해당 feature가 중요한 정보를 가질 가능성이 줄어들기 때문에 feature의 중요도가 낮아짐

#### 주요 파라미터

- verbose: int
- cols: list
- drop_invariant: bool
- return_df: bool
- use_cat_names: bool
- handle_unknown: str
- handle_missing: str
#### ex - (verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value', handle_unknown='value', use_cat_names=False)

In [7]:
# pd.get_dummies returns a new frame, so the previous defensive copy of `data`
# was dead code (it was overwritten immediately) and has been removed.
# dtype=int keeps the indicator columns as 0/1 integers regardless of the
# default dummy dtype of the installed pandas version.
data_ohe = pd.get_dummies(data, columns=['car_value'], dtype=int)

In [8]:
# Summing each indicator column gives the number of rows per car_value level.
data_ohe.filter(like='car_value').sum()

car_value_a       615
car_value_b       711
car_value_c     11896
car_value_d     69121
car_value_e    134708
car_value_f    113198
car_value_g     63359
car_value_h     20060
car_value_i      2862
dtype: int64

#### One Hot Encoding 데이터 필터링 결과

In [9]:
data_ohe.filter(like='car').head(10)

Unnamed: 0,car_value_a,car_value_b,car_value_c,car_value_d,car_value_e,car_value_f,car_value_g,car_value_h,car_value_i
0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,0,1,0,0
9,0,0,0,0,1,0,0,0,0


## 2. Label Encoding

- 범주형 변수의 n개 종류의 값들을 1에서 n값으로 숫자를 부여함
- 숫자들 사이에 관계 존재 
- 범주형 자료 중에서도 순서형 자료에 적합

<유의할 점>
- 큰 숫자에 더 큰 가중치를 부여하게 될 수 있음

- fit() : 어떻게 변환할 것인지에 대해 학습

- transform() : 문자열을 숫자로 변환

- fit_transform() : 학습과 변환을 한 번에 처리

- inverse_transform() : 숫자를 다시 문자열로 변환

- classes_ : 인코딩한 클래스 조회

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
data_le = data.copy()
# Keep the fitted encoder in a variable (as the other encoder cells do) so that
# classes_ and inverse_transform stay available after the column is encoded.
le_encoder = LabelEncoder()
data_le['car_value_encoded'] = le_encoder.fit_transform(data_le['car_value'])

In [12]:
data_le.filter(like='car').head(10)

Unnamed: 0,car_value,car_value_encoded
0,g,6
1,g,6
2,g,6
3,g,6
4,g,6
5,g,6
6,g,6
7,g,6
8,g,6
9,e,4


#### Label Encoding 데이터 필터링 결과

In [13]:
# One row per distinct (car_value, code) pair, ordered by the assigned code: the learned mapping table.
data_le.filter(like='car').drop_duplicates().sort_values(by='car_value_encoded').reset_index(drop = True)

Unnamed: 0,car_value,car_value_encoded
0,a,0
1,b,1
2,c,2
3,d,3
4,e,4
5,f,5
6,g,6
7,h,7
8,i,8


## 3. Ordinal Encoding

- 순서형 자료에 적합한 인코딩 방식.
- 범주형 자료값들이 순서형 자료인지를 고려함.
- 사용자가 주어진 범주형 변수에 직접 변수 값들 간의 순서를 dictionary형태로 정의
- 순서 정보를 담을 수 있음
- 직관적이지만 추가적인 코딩이 필요함

#### 주요 파라미터

- verbose: int
- cols: list
- drop_invariant: bool
- return_df: bool
- mapping: list of dicts
- handle_unknown: str
- handle_missing: str
#### ex - (verbose=0, mapping=None, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value')

In [14]:
data_oe = data.copy()

In [15]:
# Explicit rank for every car_value level: 'a' -> 1, 'b' -> 2, ..., 'i' -> 9.
car_dict = {level: rank for rank, level in enumerate('abcdefghi', start=1)}

# Replace each level by its rank; the original column is kept alongside.
data_oe['car_value_encoded'] = data_oe['car_value'].map(car_dict)
data_oe.filter(like='car')

Unnamed: 0,car_value,car_value_encoded
0,g,7
1,g,7
2,g,7
3,g,7
4,g,7
...,...,...
665232,g,7
665233,g,7
665240,f,6
665241,f,6


#### Ordinal Encoding 데이터 필터링 결과

In [16]:
data_oe.filter(like='car').drop_duplicates().sort_values(by='car_value_encoded').reset_index(drop = True)

Unnamed: 0,car_value,car_value_encoded
0,a,1
1,b,2
2,c,3
3,d,4
4,e,5
5,f,6
6,g,7
7,h,8
8,i,9


## 4. Helmert Encoding

특정 범주형 변수에서 특정한 level의 인코딩 값을 도출하기 위해 해당 level에 매핑되는 종속변수(dependent variable, y값)의 평균값과 그 이전의 모든 level에 매핑되는 종속변수 값들의 평균값을 비교하는 방법

#### 주요 파라미터

- verbose: int
- cols: list
- drop_invariant: bool
- return_df: bool
- handle_unknown: str
- handle_missing: str
#### ex - (verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value')

In [17]:
data_he = data.copy()

# Fit the Helmert encoder on the car_value column, then encode that same column.
he_encoder = ce.HelmertEncoder(drop_invariant=True, cols=['car_value'])
he_encoder.fit(data_he['car_value'])
data_he_tranformed = he_encoder.transform(data_he['car_value'])

# Append the contrast columns to the working copy (rows are aligned on the index).
data_he = pd.concat([data_he, data_he_tranformed], axis='columns')
data_he.filter(like='car')

Unnamed: 0,car_value,car_value_0,car_value_1,car_value_2,car_value_3,car_value_4,car_value_5,car_value_6,car_value_7
0,g,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,g,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,g,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,g,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,g,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...
665232,g,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
665233,g,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
665240,f,0.0,0.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0
665241,f,0.0,0.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0


#### Helmert Encoding 데이터 필터링 결과

In [18]:
data_he.filter(like='car').drop_duplicates().sort_values(by='car_value').reset_index(drop = True)

Unnamed: 0,car_value,car_value_0,car_value_1,car_value_2,car_value_3,car_value_4,car_value_5,car_value_6,car_value_7
0,a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
1,b,0.0,0.0,0.0,0.0,0.0,0.0,7.0,-1.0
2,c,0.0,0.0,0.0,4.0,-1.0,-1.0,-1.0,-1.0
3,d,0.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,e,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,f,0.0,0.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0
6,g,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7,h,0.0,0.0,0.0,0.0,5.0,-1.0,-1.0,-1.0
8,i,0.0,0.0,0.0,0.0,0.0,6.0,-1.0,-1.0


## 5. Binary Encoding

- 범주형 값들을 이진법으로 변환해주는 방법
- n개 종류의 feature값들이 존재할 때 binary encoding을 적용하면 약 ⌈log2(n)⌉개의 feature가 추가

<One-Hot encoding과 비교>
- One-hot encoding에 비해 훨씬 더 적은 개수의 변수가 생성
(100개의 범주가 있을 때 onehot encoding은 100개의 feature를 생성하는 반면, binary encoding은 7개의 feature만 생성)
- 모델 학습시 더 빠른 속도로 학습 가능

#### 주요 파라미터

- verbose: int
- cols: list
- drop_invariant: bool
- return_df: bool
- handle_unknown: str
- handle_missing: str
#### ex - (verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value')

In [19]:
data_be = data.copy()

# Fit the binary encoder on the car_value column, then encode that same column.
be_encoder = ce.BinaryEncoder(drop_invariant=True, cols=['car_value'])
be_encoder.fit(data_be['car_value'])
data_be_tranformed = be_encoder.transform(data_be['car_value'])

# Append the bit columns to the working copy (rows are aligned on the index).
data_be = pd.concat(objs=[data_be, data_be_tranformed], axis=1)
data_be.filter(like='car')

Unnamed: 0,car_value,car_value_0,car_value_1,car_value_2,car_value_3
0,g,0,0,0,1
1,g,0,0,0,1
2,g,0,0,0,1
3,g,0,0,0,1
4,g,0,0,0,1
...,...,...,...,...,...
665232,g,0,0,0,1
665233,g,0,0,0,1
665240,f,0,1,0,0
665241,f,0,1,0,0


#### Binary Encoding 데이터 필터링 결과

In [20]:
data_be.filter(like='car').drop_duplicates().sort_values(by='car_value').reset_index(drop = True)

Unnamed: 0,car_value,car_value_0,car_value_1,car_value_2,car_value_3
0,a,1,0,0,1
1,b,1,0,0,0
2,c,0,1,0,1
3,d,0,0,1,1
4,e,0,0,1,0
5,f,0,1,0,0
6,g,0,0,0,1
7,h,0,1,1,0
8,i,0,1,1,1


## 6. Frequency Encoding

- 범주형 feature 내 변수값들의 빈도수에 기반해 인코딩하는 방식.
- 범주형 feature의 빈도값이 target와의 연관이 클수록 모델에 가중치를 부여.
- Train 데이터로부터 학습된 값들로 Test 데이터의 피쳐가 인코딩 됨.
- Train 데이터와 Test 데이터의 분포가 다르다면 overfitting 발생.

In [21]:
data_fe = data.copy()

# Share of rows carrying each car_value level (rows with a missing car_value
# were already dropped upstream, so the shares are taken over all rows).
fe = data_fe['car_value'].value_counts(normalize=True)
data_fe['car_value_encoded'] = data_fe['car_value'].map(fe)
data_fe.filter(like = 'car')

Unnamed: 0,car_value,car_value_encoded
0,g,0.152111
1,g,0.152111
2,g,0.152111
3,g,0.152111
4,g,0.152111
...,...,...
665232,g,0.152111
665233,g,0.152111
665240,f,0.271764
665241,f,0.271764


#### Frequency Encoding 데이터 필터링 결과

In [22]:
data_fe.filter(like='car').drop_duplicates().sort_values(by='car_value').reset_index(drop = True)

Unnamed: 0,car_value,car_value_encoded
0,a,0.001476
1,b,0.001707
2,c,0.02856
3,d,0.165945
4,e,0.323405
5,f,0.271764
6,g,0.152111
7,h,0.04816
8,i,0.006871


## 7. Mean Encoding (=Target Encoding)

- 범주형 feature의 값을 학습데이터의 target 값의 평균으로 설정. 
- 범주형 피쳐가 target 값과 상관관계가 있음을 가정하는 방법.
- kaggle에서 많이 쓰임
- data의 크기가 커도 빠른 속도로 학습이 가능함. 
- tree-based 모델에서 유용

<유의할점>
- 구현과 검증이 까다로움 
- Train 데이터로부터 학습된 값들로 Test 데이터의 피쳐가 인코딩 됨.
- Train 데이터와 Test 데이터의 분포가 다르다면 overfitting 발생 -> smoothing, cv로 보완
 

#### 주요 파라미터

- verbose: int
- cols: list
- drop_invariant: bool
- return_df: bool
- handle_missing: str
- handle_unknown: str
- min_samples_leaf: int
- smoothing: float
#### ex - (verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value', handle_unknown='value', min_samples_leaf=1, smoothing=1.0)

In [23]:
# Work on a copy and duplicate car_value so the raw level stays next to its encoding.
# (The former re-assignment of car_value from `data` was a no-op on a fresh copy.)
data_me = data.copy()
data_me['car_value_encoded'] = data_me['car_value']
y = data_me['record_type']
X = data_me[['car_value_encoded', 'car_value', 'state']]

# min_samples_leaf and smoothing are pinned to the values quoted in the
# parameter example for this section, so the result does not depend on the
# defaults of the installed category_encoders version.
me_encoder = ce.TargetEncoder(cols=['car_value_encoded'],
                              min_samples_leaf=1,
                              smoothing=1.0)
data_me_tranformed = me_encoder.fit_transform(X, y)

# Re-attach the label so encoded feature and target can be inspected together.
data_me = pd.concat([data_me_tranformed, y], axis = 1)
data_me[['car_value', 'car_value_encoded', 'state', 'record_type']]

Unnamed: 0,car_value,car_value_encoded,state,record_type
0,g,0.148613,IN,0
1,g,0.148613,IN,0
2,g,0.148613,IN,0
3,g,0.148613,IN,0
4,g,0.148613,IN,0
...,...,...,...,...
665232,g,0.148613,FL,0
665233,g,0.148613,FL,1
665240,f,0.147511,FL,0
665241,f,0.147511,FL,0


#### Mean Encoding 데이터 필터링 결과

In [24]:
data_me.filter(like='car').drop_duplicates().sort_values(by='car_value').reset_index(drop = True)[['car_value', 'car_value_encoded']]

Unnamed: 0,car_value,car_value_encoded
0,a,0.156098
1,b,0.147679
2,c,0.149714
3,d,0.1456
4,e,0.147534
5,f,0.147511
6,g,0.148613
7,h,0.144766
8,i,0.139762


###  smoothing 
- mean encoding에서의 overfitting을 방지

In [25]:
# Strength of the pull towards the global mean, expressed as a pseudo-count of rows.
weight = 100

# Global target mean plus per-level row count and target mean.
mean = data_me['record_type'].mean()
agg = data_me.groupby('car_value')['record_type'].agg(['count', 'mean'])
counts, means = agg['count'], agg['mean']

# Weighted average of the level mean (weighted by its row count) and the global mean.
smooth = (counts * means + weight * mean) / (counts + weight)
print(smooth)

data_me['car_value_smean_encoded'] = data_me['car_value'].map(smooth)

car_value
a    0.154861
b    0.147628
c    0.149694
d    0.145602
e    0.147534
f    0.147511
g    0.148611
h    0.144778
i    0.140016
dtype: float64


#### Mean Encoding 데이터 필터링 결과(smoothing 기법 적용)

In [26]:
data_me.filter(like='car').drop_duplicates().sort_values(by='car_value').reset_index(drop = True)[['car_value', 'car_value_encoded', 'car_value_smean_encoded']]

Unnamed: 0,car_value,car_value_encoded,car_value_smean_encoded
0,a,0.156098,0.154861
1,b,0.147679,0.147628
2,c,0.149714,0.149694
3,d,0.1456,0.145602
4,e,0.147534,0.147534
5,f,0.147511,0.147511
6,g,0.148613,0.148611
7,h,0.144766,0.144778
8,i,0.139762,0.140016


## 8. Weight of Evidence Encoding(WoE)

- 대출채무불이행 위험의 예측을 위해 개발됨.
- 근거가 가설을 얼마나 뒷받침하는지를 측정함. 
- Logistic Regression에 적합함. -> Logistic Regression과 같은 스케일로 인코딩되기 때문
***
$$ WoE = \left[ln \left(\frac{P(1)}{P(0)}\right)\right] * 100$$
> if P(1) = P(0) , WoE = 0 -> random으로 도출 \
> if P(1) < P(0) , WoE < 0 \
> if P(1) > P(0) , WoE > 0 
- P(0)의 최소값을 0보다 큰 값으로 설정해, zero division error를 방지한다.

#### 주요 파라미터

- verbose: int
- cols: list
- drop_invariant: bool
- return_df: bool
- handle_missing: str
- handle_unknown: str
- randomized: bool,
- sigma: float
- regularization: float
#### ex - (verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', random_state=None, randomized=False, sigma=0.05, regularization=1.0)

In [27]:
# Copy of `data` with car_value duplicated, so the raw level stays next to its encoding.
data_woe = data.assign(car_value_encoded=data['car_value'])
y = data_woe['record_type']
X = data_woe[['car_value_encoded', 'car_value', 'state']]

# Only the duplicated column is encoded; car_value and state pass through unchanged.
woe_encoder = ce.WOEEncoder(cols=['car_value_encoded'])
data_woe_tranformed = woe_encoder.fit_transform(X, y)

# Re-attach the label for side-by-side inspection.
data_woe = pd.concat([data_woe_tranformed, y], axis='columns')
data_woe[['car_value', 'car_value_encoded', 'state', 'record_type']]

Unnamed: 0,car_value,car_value_encoded,state,record_type
0,g,0.010802,IN,0
1,g,0.010802,IN,0
2,g,0.010802,IN,0
3,g,0.010802,IN,0
4,g,0.010802,IN,0
...,...,...,...,...
665232,g,0.010802,FL,0
665233,g,0.010802,FL,1
665240,f,0.002027,FL,0
665241,f,0.002027,FL,0


In [28]:
data_woe['car_value_encoded'].value_counts()

 0.002198    134708
 0.002027    113198
-0.013224     69121
 0.010802     63359
-0.019741     20060
 0.019850     11896
-0.058942      2862
 0.011142       711
 0.077114       615
Name: car_value_encoded, dtype: int64

#### Weight of Evidence Encoding(WoE) 데이터 필터링 결과

In [29]:
data_woe.filter(like='car').drop_duplicates().sort_values(by='car_value').reset_index(drop = True)[['car_value', 'car_value_encoded']]

Unnamed: 0,car_value,car_value_encoded
0,a,0.077114
1,b,0.011142
2,c,0.01985
3,d,-0.013224
4,e,0.002198
5,f,0.002027
6,g,0.010802
7,h,-0.019741
8,i,-0.058942


## 9. Probability Ratio of encoding

 - WoE 방법과 유사하지만 log를 취하지 않은 P(1), P(0) 확률의 비율만을 사용함. 

#### 주요 파라미터

- verbose: int
- cols: list
- drop_invariant: bool
- return_df: bool
- handle_missing: str
- handle_unknown: str
- variables: list
- encoding_method: str

In [30]:
#!pip install feature_engine

In [31]:
from feature_engine.encoding import PRatioEncoder

In [32]:
# Copy of `data` with car_value duplicated, so the raw level stays next to its encoding.
data_pro = data.assign(car_value_encoded=data['car_value'])
y = data_pro['record_type']
X = data_pro[['car_value_encoded', 'car_value', 'state']]

# Only the duplicated column is listed in `variables`, so only it is encoded.
pro_encoder = PRatioEncoder(variables=['car_value_encoded'], encoding_method='ratio')
data_pro_tranformed = pro_encoder.fit_transform(X, y)

# Re-attach the label for side-by-side inspection.
data_pro = pd.concat([data_pro_tranformed, y], axis='columns')
data_pro[['car_value', 'car_value_encoded', 'state', 'record_type']]

Unnamed: 0,car_value,car_value_encoded,state,record_type
0,g,0.174555,IN,0
1,g,0.174555,IN,0
2,g,0.174555,IN,0
3,g,0.174555,IN,0
4,g,0.174555,IN,0
...,...,...,...,...
665232,g,0.174555,FL,0
665233,g,0.174555,FL,1
665240,f,0.173036,FL,0
665241,f,0.173036,FL,0


#### Probability Ratio of encoding 데이터 필터링 결과

In [33]:
data_pro.filter(like='car').drop_duplicates().sort_values(by='car_value').reset_index(drop = True)[['car_value', 'car_value_encoded']]

Unnamed: 0,car_value,car_value_encoded
0,a,0.184971
1,b,0.173267
2,c,0.176075
3,d,0.170412
4,e,0.173067
5,f,0.173036
6,g,0.174555
7,h,0.16927
8,i,0.16247


## 10. Hashing encoding

- 다수의 변수를 한 번에 다차원 공간에 맵핑시켜서 인코딩하는 방식
- 많은 차원의 범주형 피쳐가 있더라도, 원하는 차원의 갯수의 더미변수로 표현이 가능함. 
- 차원의 갯수가 커질수록 정확도는 높아지고 ,연산량도 증가함. 


- 해석의 용이성이 떨어짐

#### 주요 파라미터

- verbose: int
- cols: list
- drop_invariant: bool
- return_df: bool
- hash_method: str
- max_process: int
- max_sample: int
- n_components: int
#### ex - (max_process=0, max_sample=0, verbose=0, n_components=8, cols=None, drop_invariant=False, return_df=True, hash_method='md5')

In [34]:
# Work on a copy and duplicate car_value so the raw level stays next to its hash columns.
# (The former re-assignment of car_value from `data` was a no-op on a fresh copy.)
data_hash = data.copy()
data_hash['car_value_encoded'] = data_hash['car_value']
y = data_hash['record_type']
X = data_hash[['car_value_encoded', 'car_value', 'state']]

# `cols` is passed as a list, matching every other encoder cell in this notebook.
hash_encoder = ce.HashingEncoder(cols=['car_value_encoded'])
data_hash_tranformed = hash_encoder.fit_transform(X, y)

# Re-attach the label for side-by-side inspection.
data_hash = pd.concat([data_hash_tranformed, y], axis = 1)
data_hash

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,car_value,state,record_type
0,0,0,0,0,0,1,0,0,g,IN,0
1,0,0,0,0,0,1,0,0,g,IN,0
2,0,0,0,0,0,1,0,0,g,IN,0
3,0,0,0,0,0,1,0,0,g,IN,0
4,0,0,0,0,0,1,0,0,g,IN,0
...,...,...,...,...,...,...,...,...,...,...,...
665232,0,0,0,0,0,1,0,0,g,FL,0
665233,0,0,0,0,0,1,0,0,g,FL,1
665240,0,0,0,0,0,0,0,1,f,FL,0
665241,0,0,0,0,0,0,0,1,f,FL,0


#### Hashing encoding 데이터 필터링 결과

In [35]:
# Label slice keeps every column up to and including car_value (the hash columns plus the raw level),
# then reduces to one row per level. In the recorded output several levels share a row pattern, i.e. collide.
data_hash.loc[:,:'car_value'].drop_duplicates().sort_values(by='car_value').reset_index(drop = True)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,car_value
0,0,1,0,0,0,0,0,0,a
1,0,0,0,0,0,0,0,1,b
2,0,0,0,1,0,0,0,0,c
3,0,0,0,0,0,1,0,0,d
4,0,0,1,0,0,0,0,0,e
5,0,0,0,0,0,0,0,1,f
6,0,0,0,0,0,1,0,0,g
7,0,1,0,0,0,0,0,0,h
8,0,1,0,0,0,0,0,0,i


## 11. Backward Difference Encoding

- 특정 범주형 변수에서 하나의 level에 대한 종속변수값들(y값)의 평균이 바로 이전의 level에 대한 종속변수값들의 평균값과 비교되어 계산이 되는 방법이다.
- 주로 nominal, ordinal 변수에 적합하다고 알려져 있다.

#### 주요 파라미터

- verbose: int
- cols: list
- drop_invariant: bool
- return_df: bool
- handle_unknown: str
- handle_missing: str  
#### ex - (verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value')

In [36]:
data_bde = data.copy()

# Fit the backward-difference encoder on the car_value column, then encode that same column.
bde_encoder = ce.BackwardDifferenceEncoder(cols=['car_value'])
bde_encoder.fit(data_bde['car_value'])
data_bde_tranformed = bde_encoder.transform(data_bde['car_value'])

# Append the contrast columns to the working copy (rows are aligned on the index).
data_bde = pd.concat([data_bde, data_bde_tranformed], axis='columns')
data_bde.filter(like='car')

Unnamed: 0,car_value,car_value_0,car_value_1,car_value_2,car_value_3,car_value_4,car_value_5,car_value_6,car_value_7
0,g,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
1,g,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
2,g,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
3,g,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
4,g,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
...,...,...,...,...,...,...,...,...,...
665232,g,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
665233,g,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
665240,f,0.111111,0.222222,0.333333,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
665241,f,0.111111,0.222222,0.333333,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111


#### Backward Difference Encoding 데이터 필터링 결과

In [37]:
data_bde.filter(like='car').drop_duplicates().sort_values(by='car_value').reset_index(drop = True)

Unnamed: 0,car_value,car_value_0,car_value_1,car_value_2,car_value_3,car_value_4,car_value_5,car_value_6,car_value_7
0,a,0.111111,0.222222,0.333333,0.444444,0.555556,0.666667,0.777778,0.888889
1,b,0.111111,0.222222,0.333333,0.444444,0.555556,0.666667,0.777778,-0.111111
2,c,0.111111,0.222222,0.333333,0.444444,-0.444444,-0.333333,-0.222222,-0.111111
3,d,0.111111,0.222222,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
4,e,0.111111,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
5,f,0.111111,0.222222,0.333333,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
6,g,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
7,h,0.111111,0.222222,0.333333,0.444444,0.555556,-0.333333,-0.222222,-0.111111
8,i,0.111111,0.222222,0.333333,0.444444,0.555556,0.666667,-0.222222,-0.111111


## 12. Leave One Out Encoding

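- Mean encoding과 비슷하지만, 각 행을 인코딩할 때 해당 행 자신의 종속변수 값을 제외하고 같은 범주에 속한 나머지 행들의 종속변수 평균을 이용해 범주형 변수값을 인코딩하는 방법이다.
- 자기 자신 하나(one)를 빼고(leave out) 평균을 계산한다는 의미에서 leave one out이라는 이름이 붙었으며, 이를 통해 target 정보의 누수(leakage)를 줄인다. 그래서 같은 범주라도 행의 target 값에 따라 인코딩 값이 조금씩 달라진다.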

#### 주요 파라미터

- verbose: int
- cols: list
- drop_invariant: bool
- return_df: bool
- handle_missing: str
- handle_unknown: str
- sigma: float 
#### ex - (verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', random_state=None, sigma=None)

In [38]:
# Copy of `data` with car_value duplicated, so the raw level stays next to its encoding.
data_loo = data.assign(car_value_encoded=data['car_value'])
y = data_loo['record_type']
X = data_loo[['car_value_encoded', 'car_value', 'state']]

# fit_transform receives y, so the encoding is computed against the training labels.
loo_encoder = ce.LeaveOneOutEncoder(return_df=True, cols=['car_value_encoded'])
data_loo_tranformed = loo_encoder.fit_transform(X, y)

# Re-attach the label for side-by-side inspection.
data_loo = pd.concat([data_loo_tranformed, y], axis='columns')
data_loo[['car_value', 'car_value_encoded', 'state', 'record_type']]

Unnamed: 0,car_value,car_value_encoded,state,record_type
0,g,0.148616,IN,0
1,g,0.148616,IN,0
2,g,0.148616,IN,0
3,g,0.148616,IN,0
4,g,0.148616,IN,0
...,...,...,...,...
665232,g,0.148616,FL,0
665233,g,0.148600,FL,1
665240,f,0.147513,FL,0
665241,f,0.147513,FL,0


#### Leave One Out Encoding 데이터 필터링 결과

In [39]:
# state is dropped but record_type is kept before de-duplicating: in the recorded output the
# encoded value differs per (car_value, record_type) pair, so each level appears once per label.
data_loo.drop(['state'], axis = 1).drop_duplicates().sort_values(by='car_value').reset_index(drop = True)[['car_value','car_value_encoded','record_type']]

Unnamed: 0,car_value,car_value_encoded,record_type
0,a,0.154723,1
1,a,0.156352,0
2,b,0.146479,1
3,b,0.147887,0
4,c,0.149643,1
5,c,0.149727,0
6,d,0.145602,0
7,d,0.145587,1
8,e,0.147528,1
9,e,0.147535,0


## 13. James-Stein Encoding

- 관측된 feature value와 관측되지 않은 feature value의 종속변수 값의 평균값들 중에서 다시 가중치를 부여한 평균값을 이용해 인코딩하는 방법이다.
- 이러한 방법은 전체적인 평균값(모평균)에 도달하기 위해 위에서 구한 평균값들을 수축시킨다는 의미가 된다.
- 하지만 원본 데이터들의 분포가 정규분포일 때만 적용이 유의미하다.

#### 주요 파라미터

- verbose: int
- cols: list
- drop_invariant: bool
- return_df: bool
- handle_missing: str
- handle_unknown: str
- model: str
- randomized: bool,
- sigma: float
#### ex - (verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', model='independent', random_state=None, randomized=False, sigma=0.05)

In [40]:
# Copy of `data` with car_value duplicated, so the raw level stays next to its encoding.
data_js = data.assign(car_value_encoded=data['car_value'])
y = data_js['record_type']
X = data_js[['car_value_encoded', 'car_value', 'state']]

# Only the duplicated column is encoded; car_value and state pass through unchanged.
js_encoder = ce.JamesSteinEncoder(cols=['car_value_encoded'])
data_js_tranformed = js_encoder.fit_transform(X, y)

# Re-attach the label for side-by-side inspection.
data_js = pd.concat([data_js_tranformed, y], axis='columns')
data_js[['car_value', 'car_value_encoded', 'state', 'record_type']]

Unnamed: 0,car_value,car_value_encoded,state,record_type
0,g,0.148104,IN,0
1,g,0.148104,IN,0
2,g,0.148104,IN,0
3,g,0.148104,IN,0
4,g,0.148104,IN,0
...,...,...,...,...
665232,g,0.148104,FL,0
665233,g,0.148104,FL,1
665240,f,0.147417,FL,0
665241,f,0.147417,FL,0


#### James-Stein Encoding 데이터 필터링 결과

In [41]:
data_js.filter(like='car').drop_duplicates().sort_values(by='car_value').reset_index(drop = True)[['car_value', 'car_value_encoded']]

Unnamed: 0,car_value,car_value_encoded
0,a,0.152701
1,b,0.147522
2,c,0.148787
3,d,0.146219
4,e,0.147431
5,f,0.147417
6,g,0.148104
7,h,0.145694
8,i,0.142513


## 14. M-estimator Encoding

- Mean encoding을 단순화시킨 버전이다.
- 하나의 하이퍼파라미터값인 m값이 존재하는데 이 m값은 정규화의 강도를 의미한다.
- 따라서 m값을 높게 부여할수록 더 강력하게 정규화시키면서 제한강도가 높아진다.
- 권장되는 m값의 범위는 1~100사이의 값이다.

#### 주요 파라미터

- verbose: int
- cols: list
- drop_invariant: bool
- return_df: bool
- handle_missing: str
- handle_unknown: str
- randomized: bool,
- sigma: float
- m: float
#### ex - (verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', random_state=None, randomized=False, sigma=0.05, m=1.0)

In [42]:
# NOTE: this rebinds data_me / me_encoder, which the Mean Encoding section also assigns;
# after this cell they refer to the M-estimate results.
# Copy of `data` with car_value duplicated, so the raw level stays next to its encoding.
data_me = data.assign(car_value_encoded=data['car_value'])
y = data_me['record_type']
X = data_me[['car_value_encoded', 'car_value', 'state']]

# m=2 is the encoder's `m` hyper-parameter chosen for this demo.
me_encoder = ce.MEstimateEncoder(m=2, cols=['car_value_encoded'])
data_me_tranformed = me_encoder.fit_transform(X, y)

# Re-attach the label for side-by-side inspection.
data_me = pd.concat([data_me_tranformed, y], axis='columns')
data_me[['car_value', 'car_value_encoded', 'state', 'record_type']]

Unnamed: 0,car_value,car_value_encoded,state,record_type
0,g,0.148613,IN,0
1,g,0.148613,IN,0
2,g,0.148613,IN,0
3,g,0.148613,IN,0
4,g,0.148613,IN,0
...,...,...,...,...
665232,g,0.148613,FL,0
665233,g,0.148613,FL,1
665240,f,0.147511,FL,0
665241,f,0.147511,FL,0


#### M-estimator Encoding 데이터 필터링 결과

In [43]:
data_me.filter(like='car').drop_duplicates().sort_values(by='car_value').reset_index(drop = True)[['car_value', 'car_value_encoded']]

Unnamed: 0,car_value,car_value_encoded
0,a,0.156069
1,b,0.147678
2,c,0.149714
3,d,0.1456
4,e,0.147534
5,f,0.147511
6,g,0.148613
7,h,0.144766
8,i,0.139768


## 15. Thermometer Encoding

- Unary Encoding이라고도 불림
- 이산 확률 분포에 최적화된 인코딩 방식
- One-Hot Encoding과 비슷하게 0과 1로 이루어진 벡터로 표현하지만, 표현하고 싶은 값의 인덱스 하나에만 1을 부여하는 대신 첫 번째 인덱스부터 해당 값의 인덱스까지 모두 1을 부여하고 나머지 인덱스에는 0을 부여하는 방식을 사용


In [44]:
#!pip install thermoencoder

In [45]:
from thermoencoder import ThermoEncoder

In [46]:
# Reuse the ordinal-encoded frame: one row per car_value level, ordered by its rank.
data_te = data_oe.filter(like='car').drop_duplicates().sort_values(by='car_value_encoded').reset_index(drop = True)

# Thermometer-encode the rank column.
# NOTE(review): the recorded output has 8 columns for 9 levels and its last two rows
# are identical, so the two highest ranks are not distinguished - confirm the input
# range ThermoEncoder expects.
te_encoder = ThermoEncoder()
data_te_tranformed = te_encoder.fit_transform(data_te['car_value_encoded'])
data_te_tranformed

array([[1, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1]], dtype=uint8)

In [47]:
# Wrap the encoded array in a DataFrame with one named column per thermometer position.
# NOTE(review): the names jump from car_value_f to car_value_h (no car_value_g), so the
# last two columns look mislabeled relative to the levels a..i. Renaming them also
# requires updating the display cell that selects these columns by name.
data_te_pd = pd.DataFrame(data_te_tranformed, columns = ['car_value_a',
                                                         'car_value_b',
                                                         'car_value_c',
                                                         'car_value_d',
                                                         'car_value_e',
                                                         'car_value_f',
                                                         'car_value_h',
                                                         'car_value_i'])
# Attach the raw level; both frames use a default 0..8 index, so rows line up.
data_te_pd['car_value'] = data_te['car_value']

#### Thermometer Encoding 데이터 필터링 결과

In [48]:
# Show the raw level first, followed by the thermometer columns in their stored order.
data_te_pd[['car_value', *(f'car_value_{level}' for level in 'abcdefhi')]]

Unnamed: 0,car_value,car_value_a,car_value_b,car_value_c,car_value_d,car_value_e,car_value_f,car_value_h,car_value_i
0,a,1,0,0,0,0,0,0,0
1,b,1,1,0,0,0,0,0,0
2,c,1,1,1,0,0,0,0,0
3,d,1,1,1,1,0,0,0,0
4,e,1,1,1,1,1,0,0,0
5,f,1,1,1,1,1,1,0,0
6,g,1,1,1,1,1,1,1,0
7,h,1,1,1,1,1,1,1,1
8,i,1,1,1,1,1,1,1,1
