### 범주형 특징(변수)의 데이터 전처리

In [1]:
# ---------------------------------
# 데이터 등 준비
# ----------------------------------
import numpy as np
import pandas as pd

# train_x holds the training features, train_y the target variable and
# test_x the test features.
# They are kept as pandas DataFrame / Series objects (the values could also
# be held in numpy arrays).

train = pd.read_csv('../input/sample-data/train.csv')
train_y = train['target']
train_x = train.drop(columns=['target'])
test_x = pd.read_csv('../input/sample-data/test.csv')


# Keep untouched copies of both feature frames so that every example below
# can start again from the original state
train_x_saved, test_x_saved = train_x.copy(), test_x.copy()


# Hand out fresh copies of the saved training and test feature frames
def load_data():
    """Return ``(train_x, test_x)`` as new copies of the saved originals.

    Each call copies ``train_x_saved`` and ``test_x_saved``, so callers can
    modify the returned frames without affecting the saved versions.
    """
    fresh_train = train_x_saved.copy()
    fresh_test = test_x_saved.copy()
    return fresh_train, fresh_test

# Names of the categorical columns that the encoders below transform
cat_cols = [
    'sex',
    'product',
    'medical_info_b2',
    'medical_info_b3',
]

### 데이터 전처리 - One-hot encoding - Pandas 이용
  * 위에서 지정한 범주형 특징에 대해 원핫 인코딩을 수행
  * Pandas의 get_dummies()를 이용하기

In [2]:
# -----------------------------------
# one-hot encoding
# -----------------------------------
# Start from fresh copies of the data
train_x, test_x = load_data()
# -----------------------------------

# Stack the training rows on top of the test rows and one-hot encode the
# categorical columns of the combined frame in a single get_dummies call
all_x = pd.get_dummies(pd.concat([train_x, test_x]), columns=cat_cols)

# Cut the combined frame back into its training and test parts.
# The training part keeps its row count after the first slice, so
# len(train_x) is still the correct split point for the second slice.
train_x = all_x.iloc[:len(train_x)].reset_index(drop=True)
test_x = all_x.iloc[len(train_x):].reset_index(drop=True)

In [4]:
train_x[0:5]

Unnamed: 0,age,height,weight,amount,date,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,medical_info_c1,...,medical_info_b3_D,medical_info_b3_E,medical_info_b3_F,medical_info_b3_G,medical_info_b3_H,medical_info_b3_a,medical_info_b3_b,medical_info_b3_c,medical_info_b3_d,medical_info_b3_e
0,50,166.445608,65.016732,7000000,2015/2/3,134,202,1,11,1.0,...,0,0,0,0,0,0,0,0,0,0
1,68,164.334615,56.544217,7000000,2015/5/9,438,263,3,14,,...,1,0,0,0,0,0,0,0,0,0
2,77,167.462917,54.242267,6000000,2016/2/13,313,325,1,18,2.0,...,0,0,0,0,1,0,0,0,0,0
3,17,177.097725,71.147762,8000000,2015/7/6,342,213,2,11,2.0,...,0,0,1,0,0,0,0,0,0,0
4,62,158.165788,65.240697,9000000,2016/9/17,327,102,0,14,2.0,...,0,0,1,0,0,0,0,0,0,0


In [5]:
test_x[0:5]

Unnamed: 0,age,height,weight,amount,date,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,medical_info_c1,...,medical_info_b3_D,medical_info_b3_E,medical_info_b3_F,medical_info_b3_G,medical_info_b3_H,medical_info_b3_a,medical_info_b3_b,medical_info_b3_c,medical_info_b3_d,medical_info_b3_e
0,49,187.431987,81.008363,1000000,2016/12/6,302,212,1,10,3.0,...,0,1,0,0,0,0,0,0,0,0
1,79,171.63263,71.067812,2000,2016/9/3,197,469,0,14,3.0,...,0,0,0,0,0,0,0,0,0,0
2,78,163.543983,64.032098,4000000,2015/4/10,247,225,2,17,,...,0,0,0,0,0,0,0,0,0,1
3,26,150.391858,52.32291,1000000,2016/4/17,108,228,0,15,0.0,...,0,0,0,0,0,0,0,0,0,0
4,14,165.835167,67.008154,4000000,2015/1/26,181,90,2,11,0.0,...,0,0,0,1,0,0,0,0,0,0


### 데이터 전처리 - One-hot encoding - Scikit-learn 이용
  * 위에서 지정한 범주형 특징에 대해 원핫 인코딩을 수행
  * scikit-learn의 OneHotEncoder를 이용하기

In [6]:
# -----------------------------------
# Start from fresh copies of the data
train_x, test_x = load_data()
# -----------------------------------
from sklearn.preprocessing import OneHotEncoder

# Build an encoder that returns a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the new spelling first and fall back to the
# old one on installations that do not know it yet.
try:
    ohe = OneHotEncoder(sparse_output=False, categories='auto')
except TypeError:
    ohe = OneHotEncoder(sparse=False, categories='auto')

# Learn the categories from the training data only
ohe.fit(train_x[cat_cols])

# Build the dummy-variable column names as "<column>_<category>", in the
# same column order the encoder emits
columns = []
for c, categories in zip(cat_cols, ohe.categories_):
    columns.extend(f'{c}_{v}' for v in categories)

# Wrap the encoded arrays in DataFrames.  Reusing the index of the source
# frame keeps the column-wise concat below aligned row by row even when
# that index is not a plain 0..n-1 range.
dummy_vals_train = pd.DataFrame(
    ohe.transform(train_x[cat_cols]), columns=columns, index=train_x.index)
dummy_vals_test = pd.DataFrame(
    ohe.transform(test_x[cat_cols]), columns=columns, index=test_x.index)

# Replace the original categorical columns with their dummy variables
train_x = pd.concat([train_x.drop(cat_cols, axis=1), dummy_vals_train], axis=1)
test_x = pd.concat([test_x.drop(cat_cols, axis=1), dummy_vals_test], axis=1)

In [9]:
dummy_vals_train[0:5]

Unnamed: 0,sex_Female,sex_Male,product_A1,product_A2,product_A3,product_B1,product_B2,product_B3,product_C1,product_C2,...,medical_info_b3_D,medical_info_b3_E,medical_info_b3_F,medical_info_b3_G,medical_info_b3_H,medical_info_b3_a,medical_info_b3_b,medical_info_b3_c,medical_info_b3_d,medical_info_b3_e
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
dummy_vals_test[0:5]

Unnamed: 0,sex_Female,sex_Male,product_A1,product_A2,product_A3,product_B1,product_B2,product_B3,product_C1,product_C2,...,medical_info_b3_D,medical_info_b3_E,medical_info_b3_F,medical_info_b3_G,medical_info_b3_H,medical_info_b3_a,medical_info_b3_b,medical_info_b3_c,medical_info_b3_d,medical_info_b3_e
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### 데이터 전처리 - 레이블 인코딩(Label Encoding)
  * 위에서 지정한 범주형 특징에 대해 레이블 인코딩 수행

In [20]:
# Start from fresh copies of the training and test data
train_x, test_x = load_data()
# Preview the first five rows of the categorical columns before encoding
train_x[cat_cols].head(5)

Unnamed: 0,sex,product,medical_info_b2,medical_info_b3
0,Male,D1,3,2
1,Female,A1,2,D
2,Male,A3,9,H
3,Male,B1,1,F
4,Female,A2,1,F


In [21]:
# -----------------------------------
from sklearn.preprocessing import LabelEncoder

# Label-encode the categorical columns one at a time
for c in cat_cols:
    # The label mapping is learned from the training column alone and then
    # applied to both the training and the test column.
    # NOTE(review): the test column is transformed with an encoder that has
    # only seen training labels - confirm test has no unseen categories.
    le = LabelEncoder().fit(train_x[c])
    train_x[c] = le.transform(train_x[c])
    test_x[c] = le.transform(test_x[c])

In [22]:
train_x[cat_cols]

Unnamed: 0,sex,product,medical_info_b2,medical_info_b3
0,1,9,2,1
1,0,0,1,7
2,1,2,3,11
3,1,3,0,9
4,0,1,0,9
...,...,...,...,...
9995,1,1,0,13
9996,0,8,0,8
9997,0,8,1,4
9998,0,6,0,9


### 데이터 전처리 - 특징 해싱(feature hashing)

In [32]:
# Start from fresh copies of the training and test data
train_x, test_x = load_data()

# Display the categorical columns before they are hashed
train_x[cat_cols]

Unnamed: 0,sex,product,medical_info_b2,medical_info_b3
0,Male,D1,3,2
1,Female,A1,2,D
2,Male,A3,9,H
3,Male,B1,1,F
4,Female,A2,1,F
...,...,...,...,...
9995,Male,A2,1,b
9996,Female,C3,1,E
9997,Female,C3,2,A
9998,Female,C1,1,F


In [33]:
from sklearn.feature_extraction import FeatureHasher

# Number of hashed output columns produced for every categorical column.
# Kept in one place so the hasher and the column names cannot drift apart.
n_hash_features = 5

# FeatureHasher is used a little differently from the other encoders: it is
# never fitted, only transform() is called, so one instance serves all columns
fh = FeatureHasher(n_features=n_hash_features, input_type='string')

# Hash the categorical columns one at a time
for c in cat_cols:
    hash_cols = [f'{c}_{i}' for i in range(n_hash_features)]

    # Cast the column to str first; each row is then handed to the hasher
    # as a one-element array of strings
    hash_train = fh.transform(train_x[[c]].astype(str).values)
    hash_test = fh.transform(test_x[[c]].astype(str).values)

    # Convert the sparse results to DataFrames.  toarray() yields a plain
    # ndarray (todense() would give an np.matrix), and reusing the source
    # index keeps the column-wise concat below aligned row by row.
    hash_train = pd.DataFrame(
        hash_train.toarray(), columns=hash_cols, index=train_x.index)
    hash_test = pd.DataFrame(
        hash_test.toarray(), columns=hash_cols, index=test_x.index)

    # Append the hashed columns to the original frames
    train_x = pd.concat([train_x, hash_train], axis=1)
    test_x = pd.concat([test_x, hash_test], axis=1)

# Drop the original categorical columns now that they have been hashed
train_x.drop(cat_cols, axis=1, inplace=True)
test_x.drop(cat_cols, axis=1, inplace=True)

In [34]:
hash_train[0:5]

Unnamed: 0,medical_info_b3_0,medical_info_b3_1,medical_info_b3_2,medical_info_b3_3,medical_info_b3_4
0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,-1.0,0.0
4,0.0,0.0,0.0,-1.0,0.0


In [35]:
hash_test[0:5]

Unnamed: 0,medical_info_b3_0,medical_info_b3_1,medical_info_b3_2,medical_info_b3_3,medical_info_b3_4
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0


### 데이터 전처리 - frequency encoding

In [38]:
# -----------------------------------
# frequency encoding
# -----------------------------------
# Start from fresh copies of the training and test data
train_x, test_x = load_data()
# Display the categorical columns before encoding
train_x[cat_cols]

Unnamed: 0,sex,product,medical_info_b2,medical_info_b3
0,Male,D1,3,2
1,Female,A1,2,D
2,Male,A3,9,H
3,Male,B1,1,F
4,Female,A2,1,F
...,...,...,...,...
9995,Male,A2,1,b
9996,Female,C3,1,E
9997,Female,C3,2,A
9998,Female,C1,1,F


In [40]:
# -----------------------------------
# Frequency-encode the categorical columns one at a time
for c in cat_cols:
    # Number of occurrences of every category, counted on the training data
    freq = train_x[c].value_counts()
    # Replace each category by that training-set count in both frames
    train_x[c], test_x[c] = train_x[c].map(freq), test_x[c].map(freq)

In [41]:
train_x[cat_cols]

Unnamed: 0,sex,product,medical_info_b2,medical_info_b3
0,6023,1013,1988,580
1,3977,1977,2484,586
2,6023,2002,531,630
3,6023,395,4997,607
4,3977,1010,4997,607
...,...,...,...,...
9995,6023,1010,4997,597
9996,3977,1017,4997,569
9997,3977,1017,2484,575
9998,3977,996,4997,607


### 데이터 전처리 - target encoding

In [43]:
# -----------------------------------
# target encoding
# -----------------------------------
# Start from fresh copies of the training and test data
train_x, test_x = load_data()
# -----------------------------------
# Display the categorical columns before encoding
train_x[cat_cols]

Unnamed: 0,sex,product,medical_info_b2,medical_info_b3
0,Male,D1,3,2
1,Female,A1,2,D
2,Male,A3,9,H
3,Male,B1,1,F
4,Female,A2,1,F
...,...,...,...,...
9995,Male,A2,1,b
9996,Female,C3,1,E
9997,Female,C3,2,A
9998,Female,C1,1,F


In [44]:
from sklearn.model_selection import KFold

# Target-encode the categorical columns one at a time
for c in cat_cols:
    # Pair the column with the target so per-category means can be taken
    data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y})

    # Buffer for the encoded training values; every slot starts as NaN and
    # is filled when the row's fold is processed below
    tmp = np.full(train_x.shape[0], np.nan)

    # Test data: replace each category by its target mean over the whole
    # training set
    target_mean = data_tmp.groupby(c)['target'].mean()
    test_x[c] = test_x[c].map(target_mean)

    # Training data: split into folds and encode each fold out-of-fold
    kf = KFold(n_splits=4, shuffle=True, random_state=72)
    for idx_1, idx_2 in kf.split(train_x):
        # Per-category target mean computed from the other folds only
        target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
        # Store the encoded values for this fold in the buffer
        tmp[idx_2] = train_x[c].iloc[idx_2].map(target_mean)

    # Overwrite the original column with its encoded values
    train_x[c] = tmp

In [45]:
train_x[cat_cols]

Unnamed: 0,sex,product,medical_info_b2,medical_info_b3
0,0.179885,0.338603,0.177672,0.222477
1,0.216786,0.149284,0.181868,0.176744
2,0.176353,0.152451,0.174807,0.197425
3,0.183714,0.195364,0.201710,0.178330
4,0.210985,0.190217,0.200481,0.169528
...,...,...,...,...
9995,0.179943,0.223514,0.201437,0.197309
9996,0.207378,0.162581,0.190451,0.167442
9997,0.216786,0.165138,0.181868,0.216092
9998,0.216786,0.166439,0.201437,0.185185


In [46]:
# -----------------------------------
# target encoding - redone inside every cross-validation fold
# -----------------------------------
# Start from fresh copies of the data
train_x, test_x = load_data()
# -----------------------------------
from sklearn.model_selection import KFold

# Re-apply target encoding separately for every cross-validation fold
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):

    # Split the training data into this fold's training and validation parts
    # (copies, so the encoding below does not touch train_x itself)
    tr_x, va_x = train_x.iloc[tr_idx].copy(), train_x.iloc[va_idx].copy()
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    # Target-encode the categorical columns one at a time
    for c in cat_cols:
        # Per-category target mean over this fold's training part
        data_tmp = pd.DataFrame({c: tr_x[c], 'target': tr_y})
        target_mean = data_tmp.groupby(c)['target'].mean()
        # Encode the validation part with those means.  The column is
        # replaced as a whole (va_x[c] = ...) rather than written through
        # .loc[:, c]: pandas >= 2.0 performs the .loc form in place, which
        # leaves a string column as object dtype after encoding.
        va_x[c] = va_x[c].map(target_mean)

        # Buffer for the encoded values of this fold's training part
        tmp = np.repeat(np.nan, tr_x.shape[0])
        kf_encoding = KFold(n_splits=4, shuffle=True, random_state=72)
        for idx_1, idx_2 in kf_encoding.split(tr_x):
            # Per-category target mean from the other inner folds only
            target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
            # Store the encoded values for this inner fold in the buffer
            tmp[idx_2] = tr_x[c].iloc[idx_2].map(target_mean)

        # Replace the whole column so it takes the buffer's float dtype
        tr_x[c] = tmp

    # If needed, save the encoded features here so they can be read back later.

In [48]:
tr_x[cat_cols]

Unnamed: 0,sex,product,medical_info_b2,medical_info_b3
0,0.180409,0.334459,0.184423,0.205279
1,0.208222,0.156028,0.176851,0.185976
3,0.180409,0.186364,0.201909,0.171598
5,0.208222,0.206226,0.193577,0.165730
6,0.180409,0.148376,0.184423,0.178462
...,...,...,...,...
9994,0.178329,0.156028,0.193577,0.165730
9995,0.180362,0.222621,0.199930,0.190909
9997,0.212955,0.176471,0.188706,0.216463
9998,0.213605,0.174377,0.201909,0.171598


In [49]:
# -----------------------------------
# target encoding - cross-validation and target encoding share one fold split
# -----------------------------------
# Start from fresh copies of the data
train_x, test_x = load_data()
# -----------------------------------
from sklearn.model_selection import KFold

# Fold definition used for the encoding below
kf = KFold(n_splits=4, shuffle=True, random_state=71)

# Target-encode the categorical columns one at a time
for c in cat_cols:

    # Buffer for the encoded values; every slot starts as NaN
    tmp = np.full(train_x.shape[0], np.nan)
    # Pair the column with the target
    data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y})

    # Walk over the training / validation split of every fold
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        # Per-category target mean over the fold's training rows
        target_mean = data_tmp.iloc[tr_idx].groupby(c)['target'].mean()
        # Encode the fold's validation rows and store them in the buffer
        tmp[va_idx] = train_x[c].iloc[va_idx].map(target_mean)

    # Overwrite the original column with its encoded values
    train_x[c] = tmp

In [50]:
train_x[cat_cols]

Unnamed: 0,sex,product,medical_info_b2,medical_info_b3
0,0.178905,0.330214,0.189262,0.190255
1,0.209581,0.147410,0.177454,0.188209
2,0.178587,0.150495,0.173028,0.197959
3,0.182860,0.164948,0.202663,0.182609
4,0.210438,0.205479,0.198779,0.163043
...,...,...,...,...
9995,0.179538,0.209974,0.192907,0.215385
9996,0.210438,0.160875,0.198779,0.168293
9997,0.207749,0.162019,0.196078,0.193103
9998,0.209581,0.173973,0.199680,0.180617
