##### [ HIGH CARDINALITY ]
- 범주형 컬럼/ 피쳐에서 고유한 값 종류가 많고 중복은 적은 경우를 이르는 말
- ML에서 발생하는 문제
    * 차원의 저주 => 학습이 불안정/성능 저하 ★ 차원은 피쳐 개수
    * 과대적합 가능성이 높아짐
- 범주형 => 수치화 과정에서 OneHot인코딩 시 발생하기도 함

[1] 모듈 로딩 및 데이터 준비 <hr>

In [16]:
## ===========================================================
## 모듈 로딩
## ===========================================================
## 일반 모듈
import pandas as pd
import numpy as np

## ML관련 모듈들
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, TargetEncoder, LabelEncoder



In [17]:
## ==================================================
## 0. 공통 데이터 & Train/Test 분리
## ==================================================
df = pd.DataFrame({
    "job": [
        "office", "office", "office", "office",
        "engineer", "engineer", "engineer", "engineer",
        "teacher", "teacher", "teacher", "teacher"
    ],
    "city": [
        "Seoul", "Seoul", "Busan", "Busan",
        "Seoul", "Busan", "Seoul", "Busan",
        "Incheon", "Incheon", "Seoul", "Busan"
    ],
    "age": [25, 30, 45, 38, 40, 42, 36, 33, 28, 39, 41, 35],
    "high_income": [0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0]
})

## 기본 정보 확인
df.info()

## 피쳐와 타겟 컬럼 설정
X = df[['job', 'city', 'age']]
y = df[['high_income']]

## 범주형/수치형 컬럼이름 리스트
# Use X (features only) to derive column lists so target 'high_income' is not included in num_cols
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(include='number').columns
print(f'cat_col => {cat_cols}, num_cols => {num_cols}')

## 학습용/테스토용 데이터셋 분리
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

## 학습용/테스트용 범주형과 수치형 데이터 복사
x_train_cat = x_train[cat_cols].copy()
x_test_cat = x_test[cat_cols].copy()

x_train_num = x_train[num_cols].copy()
x_test_num = x_test[num_cols].copy()

print('============ 원본 데이터 ==============')
print(x_train, "\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   job          12 non-null     object
 1   city         12 non-null     object
 2   age          12 non-null     int64 
 3   high_income  12 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 516.0+ bytes
cat_col => Index(['job', 'city'], dtype='object'), num_cols => Index(['age'], dtype='object')
         job     city  age
1     office    Seoul   30
5   engineer    Busan   42
10   teacher    Seoul   41
11   teacher    Busan   35
9    teacher  Incheon   39
0     office    Seoul   25
7   engineer    Busan   33
2     office    Busan   45
3     office    Busan   38 



In [18]:
## ============================================================
## 1. One Hot Encoding
## ============================================================
# => 인스턴스 생성
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

## 학습용 데이터로 인코더 생성 + 변환
x_train_cat_ohe = ohe.fit_transform(x_train_cat)

## 테스트 데이터 변환
x_test_cat_ohe = ohe.transform(x_test_cat)



## => 컬럼명 설정
ohe_feature_names = ohe.get_feature_names_out(cat_cols)

## => 인코딩된 DF와 기존 DF 연결
x_train_cat_ohe = pd.DataFrame(x_train_cat_ohe,
                               index = x_train.index,
                               columns = ohe_feature_names)
x_test_cat_ohe = pd.DataFrame(x_test_cat_ohe,
                              index = x_test.index,
                              columns = ohe_feature_names)

x_train_ohe = pd.concat([x_train_cat_ohe, x_train_num], axis=1)
x_test_ohe = pd.concat([x_test_cat_ohe, x_test_num], axis=1)

print(" ===[1] One Hot Encoding ====")
print('Train shpae :', x_train_ohe.shape)
print('Test shpae :', x_test_ohe.shape)

display(X.head(2), x_test_ohe.head(2))




 ===[1] One Hot Encoding ====
Train shpae : (9, 7)
Test shpae : (3, 7)


Unnamed: 0,job,city,age
0,office,Seoul,25
1,office,Seoul,30


Unnamed: 0,job_engineer,job_office,job_teacher,city_Busan,city_Incheon,city_Seoul,age
8,0.0,0.0,1.0,0.0,1.0,0.0,28
4,1.0,0.0,0.0,0.0,0.0,1.0,40


In [19]:
## ============================================================
## 2. Target Encoding
## ============================================================
counts = y_train.value_counts()
min_count = counts.min()
cv_te = min(5, int(min_count))
print('TargetEncoder용 y_train 분포 : \n',counts)
print('선택된 cv: ', cv_te, "\n")


## -> 타겟인코더 인스턴스 생성
te = TargetEncoder(cv=cv_te, random_state=42)

## => 훈련용 인코더 생성 및 테스트용 데이터에 인코딩 적용
x_train_cat_te_np = te.fit_transform(x_train_cat, y_train)

## => 테스트용 데이터에도 인코딩 변환
x_test_cat_te_np = te.transform(x_test_cat)


## dusruf

## => 컬럼명 설정
te_feature_names = te.get_feature_names_out(cat_cols)

## => 인코딩된 DF와 기존 DF 연결
x_train_cat_te = pd.DataFrame(x_train_cat_te_np,
                               index = x_train.index,
                               columns = te_feature_names)
x_test_cat_te = pd.DataFrame(x_test_cat_te_np,
                              index = x_test.index,
                              columns = te_feature_names)

x_train_te = pd.concat([x_train_cat_te, x_train_num], axis=1)
x_test_te = pd.concat([x_test_cat_te, x_test_num], axis=1)

print(" ===[2] Target Encoding ====")
print('Train shpae :', x_train_te.shape)
print('Test shpae :', x_test_te.shape)

display(X.head(2), x_test_te.head(2))

TargetEncoder용 y_train 분포 : 
 high_income
1              6
0              3
Name: count, dtype: int64
선택된 cv:  3 

 ===[2] Target Encoding ====
Train shpae : (9, 3)
Test shpae : (3, 3)


  y = column_or_1d(y, warn=True)


Unnamed: 0,job,city,age
0,office,Seoul,25
1,office,Seoul,30


Unnamed: 0,job,city,age
8,0.666667,1.0,28
4,1.0,0.416667,40


In [21]:
## ===============================================
## [3] Frequency Encoding
## ===============================================
x_train_cat_freq = x_train_cat.copy()
x_test_cat_freq = x_test_cat.copy()

freq_map = {}
for col in cat_cols :
    # 범주형 고유값별 비율 계산
    vc = x_train_cat[col].value_counts(normalize=True)
    
    ## 비율값을 각 고육밧 인코딩의 수치값으로 사용
    freq_map[col] = vc
    x_train_cat_freq[col] = x_train_cat[col].map(vc).fillna(0)
    x_test_cat_freq[col] = x_test_cat[col].map(vc).fillna(0)
    
## 인코딩된 범주형 피쳐들과 다른 피쳐들 연결
x_train_freq = pd.concat([x_train_cat_freq, x_train_num], axis = 1)
x_test_freq = pd.concat([x_test_cat_freq, x_test_num], axis = 1)

print(" ===[3] Frequency Encoding ====")
print('Train shpae :', x_train_freq.shape)
print('Test shpae :', x_test_freq.shape)

display(X.head(2), x_test_freq.head(2))


 ===[3] Frequency Encoding ====
Train shpae : (9, 3)
Test shpae : (3, 3)


Unnamed: 0,job,city,age
0,office,Seoul,25
1,office,Seoul,30


Unnamed: 0,job,city,age
8,0.333333,0.111111,28
4,0.222222,0.333333,40


In [24]:
# ==================================================
# 4. Binary Encoding (LabelEncoder + 비트 분해)
#     범주 → LabelEncoder로 정수 → 이진수로 분해 → 여러 컬럼으로
# ==================================================
X_train_cat_bin = x_train_cat.copy()
X_test_cat_bin  = x_test_cat.copy()

for col in cat_cols:
    le = LabelEncoder()
    train_codes = le.fit_transform(X_train_cat_bin[col])
    test_codes  = le.transform(X_test_cat_bin[col])

    n_classes = len(le.classes_)
    n_bits = int(np.ceil(np.log2(max(n_classes, 2))))  # 최소 1비트

    for bit in range(n_bits):
        bit_col = f"{col}_bit{bit}"
        X_train_cat_bin[bit_col] = (train_codes >> bit) & 1
        X_test_cat_bin[bit_col]  = (test_codes  >> bit) & 1

    # 원래 범주형 컬럼 제거 (선택 사항)
    X_train_cat_bin = X_train_cat_bin.drop(columns=[col])
    X_test_cat_bin  = X_test_cat_bin.drop(columns=[col])

X_train_bin = pd.concat([X_train_cat_bin, x_train_num], axis=1)
X_test_bin  = pd.concat([X_test_cat_bin,  x_test_num],  axis=1)


print("=== [4] Binary Encoding ===")
print("Original shape:", X.shape, end=' ====> 인코딩 후 ')
print("Train shape:", X_train_bin.shape, end='  ')
print("Test  shape:", X_test_bin.shape)

display(X.head(2), X_train_bin.head(2))

=== [4] Binary Encoding ===
Original shape: (12, 3) ====> 인코딩 후 Train shape: (9, 5)  Test  shape: (3, 5)


Unnamed: 0,job,city,age
0,office,Seoul,25
1,office,Seoul,30


Unnamed: 0,job_bit0,job_bit1,city_bit0,city_bit1,age
1,1,0,0,1,30
5,0,0,0,0,42
