In [1]:
!pip install category_encoders




[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [2]:
import numpy as np 
import pandas as pd

import category_encoders as ce

# 사용법

## One Hot Encoding

- one hot encoding은 범주형을 0,1로 이루어진 컬럼으로 바꿔주는 인코딩 방식
- 가장 많이 사용하면 대부분 다 해결됨
- 단점 : 피처의 항목이 많은 경우 차원의 저주에 빠질 수 있다.

In [24]:
data = {'color':['Red','Blue','Green']}
df= pd.DataFrame(data)
df.head()

Unnamed: 0,color
0,Red
1,Blue
2,Green


In [25]:
encoder = ce.OneHotEncoder(use_cat_names=True) # 인코딩 객체 만들어짐 (use_cat_names는 기존 컬럼명 유지 용도)

df_encoded = encoder.fit_transform(df) # df를 인코딩

df_encoded.head()

Unnamed: 0,color_Red,color_Blue,color_Green
0,1,0,0
1,0,1,0
2,0,0,1


## mean encoing

In [26]:
data = {'Pincode': ['753001', '753002', '753003', '753001', '753004', '753002', '753002', '753001', '753003']
        , 'O/P': [1, 1, 0, 0, 1, 0, 1, 0, 1]}
df2 = pd.DataFrame(data) 
df2.head(3)

Unnamed: 0,Pincode,O/P
0,753001,1
1,753002,1
2,753003,0


In [27]:
group_mean = df2.groupby('Pincode')['O/P'].mean()
group_mean

Pincode
753001    0.333333
753002    0.666667
753003    0.500000
753004    1.000000
Name: O/P, dtype: float64

In [28]:
df2['Mean'] = df2['Pincode'].map(group_mean)
df2.head()

Unnamed: 0,Pincode,O/P,Mean
0,753001,1,0.333333
1,753002,1,0.666667
2,753003,0,0.5
3,753001,0,0.333333
4,753004,1,1.0


## Target Encoding

In [29]:
data = {'Column': ['Btech', 'PHD', 'Masters', 'High School', 'PHD', 'Btech', 'Masters', 'High School', 'High School']
        , 'O/P': [1, 1, 0, 0, 1, 0, 0, 0, 1]}
df = pd.DataFrame(data) 
df

Unnamed: 0,Column,O/P
0,Btech,1
1,PHD,1
2,Masters,0
3,High School,0
4,PHD,1
5,Btech,0
6,Masters,0
7,High School,0
8,High School,1


In [30]:
encoder = ce.TargetEncoder()

df_encoded = encoder.fit_transform(df['Column'], df['O/P'])
df['encoded'] = df_encoded['Column']
df['Rank'] = df['encoded'].rank(method='dense', ascending=False)

df.head(3)

Unnamed: 0,Column,O/P,encoded,Rank
0,Btech,1,0.452325,2.0
1,PHD,1,0.523251,1.0
2,Masters,0,0.3814,4.0


## Label Encoding(별로 안좋음)

In [31]:
data = {'column': ['Btech', 'Masters', 'High School', 'PHD']}
df = pd.DataFrame(data) 
df.head()

Unnamed: 0,column
0,Btech
1,Masters
2,High School
3,PHD


In [32]:
from sklearn.preprocessing import LabelEncoder

In [33]:
encoder = LabelEncoder()

df['column_encoded'] = encoder.fit_transform(df['column'])

df.head()

Unnamed: 0,column,column_encoded
0,Btech,0
1,Masters,2
2,High School,1
3,PHD,3


In [34]:
df['column'].unique()

array(['Btech', 'Masters', 'High School', 'PHD'], dtype=object)

## Ordinal Encoding

In [35]:
df = pd.DataFrame(
    {'Fruit': ['시과', '딸기', '바나나', '수박', '포도',
               '메론','자두','체리','화이트베리', '무화과'],
     'color':['red1','red2','yellow','red','purple','green','light red','pink','white','brown'],
     'price': [2000,300,400, 30000, 150, 8000,1000,100,300,800]})

df.head()

Unnamed: 0,Fruit,color,price
0,시과,red1,2000
1,딸기,red2,300
2,바나나,yellow,400
3,수박,red,30000
4,포도,purple,150


In [36]:
encoder = ce.OrdinalEncoder(cols = 'color')

df_encoded = encoder.fit_transform(df)
df_encoded.head()

Unnamed: 0,Fruit,color,price
0,시과,1,2000
1,딸기,2,300
2,바나나,3,400
3,수박,4,30000
4,포도,5,150


# 예제

In [37]:
result = []

In [38]:
from sklearn.tree import DecisionTreeClassifier
SEED = 42


In [39]:
import seaborn as sns

df = sns.load_dataset('titanic')
cols = ["age","sibsp","parch","fare","pclass","sex","embarked", "survived"]
df = df[cols]
df.shape 

(891, 8)

### 데이터 분리

In [48]:
from sklearn.model_selection import train_test_split
SEED=42

train, test = train_test_split(df,random_state=SEED,test_size=0.2)

train.shape, test.shape


((712, 8), (179, 8))

### 결측치 제거

In [41]:
train['age'] = train['age'].fillna(train.age.mean())
test['age'] = test['age'].fillna(train.age.mean())

train['embarked'] = train['embarked'].fillna(train.embarked.mode().values[0])
test['embarke'] = test['embarked'].fillna(train.embarked.mode().values[0])

train.isnull().sum().sum() # encoding 전에 찍어 봐야함 

0

In [42]:
# 나누고 feature 과 target으로 나눔 (X_tr , Y_tr 나누는것과 비슷)
cols = ['age','fare']
features_tr = train[cols]
target_tr = train['survived']
features_te = test[cols]
target_te = test['survived']

features_tr.shape, target_tr.shape

((712, 2), (712,))

In [79]:
cols_encoding = ["pclass","sex","embarked","sibsp","parch"]
tmp_tr = train[cols_encoding]  
tmp_te = test[cols_encoding]  
# 수치형 들만 따로 빼는데 sex, embakred는 포함 
tmp_tr.shape , tmp_te.shape 

((712, 5), (179, 5))

In [80]:
tmp_tr['sex'] = tmp_tr['sex'].map({'male':1, 'female':0})
tmp_tr['embarked'] = tmp_tr['embarked'].map({'S':2, 'C':1, 'Q':0})

tmp_te['sex'] = tmp_te['sex'].map({'male':1, 'female':0})
tmp_te['embarked'] = tmp_te['embarked'].map({'S':2, 'C':1, 'Q':0})

# sex,embarked를 수치형으로 변경 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_tr['sex'] = tmp_tr['sex'].map({'male':1, 'female':0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_tr['embarked'] = tmp_tr['embarked'].map({'S':2, 'C':1, 'Q':0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_te['sex'] = tmp_te['sex'].map({'male':1, 'female':0})
A value is trying to b

In [81]:
tmp_tr.shape , tmp_te.shape

((712, 5), (179, 5))

## One hot Encoding

In [49]:
encoder = ce.OneHotEncoder(use_cat_names=True)

In [91]:
enco_tr = pd.DataFrame() # 깡통 만들기
enco_te = pd.DataFrame() # 깡통 만들기

for col in tmp_tr.columns:
  _enco = encoder.fit_transform(tmp_tr[col].astype('category'))
# 수치형 자료들을 카테고리 형태로 변환 시킨것 칼럼들을 encoding 적용
  enco_tr = pd.concat([enco_tr, _enco], axis=1)
# 변환된거 깡통에 업그레이드 ( for 문 자체에서 업데이트 붙이는게 꼭 필요하다 ) 
    
  _enco2 = encoder.transform(tmp_te[col].astype('category'))
  enco_te = pd.concat([enco_te, _enco2], axis=1)
# 변환된거 깡통에 업그레이드 ( for 문 자체에서 업데이트 붙이는게 꼭 필요하다 ) 
    
# print(f'{enco_tr.shape} / {enco_te.shape}')

In [88]:
_enco.shape

(712, 7)

In [89]:
_enco2.shape

(179, 7)

In [92]:
enco_tr.shape

(712, 23)