##### 범주형 변수(categorical variable)를 수치형 변수(Numerical variable)로
##### 특성 공학 : 특정 애플리케이션에 가장 적합한 데이터 표현을 찾는 것
### 원-핫-인코딩(one-hot-encoding)
- n개의 범주형 데이터를 n개의 비트(0,1) 벡터로 표현
- 범주형 변수를 표현하는데 가장 널리 쓰이는 방법
- one-out-of-n-encoding 또는 가변수(dummy variable)라고도 한다

In [15]:
import os
import pandas as pd
# "names" 매개변수로 열 이름을 제공합니다
adult_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
# NOTE(review): with names= given, header=1 makes pandas treat the second
# line of the file as the header row, so nothing up to and including that
# line is loaded as data. If the CSV has a single header line, header=0
# would keep the first record -- confirm against the file.
data = pd.read_csv(
    '../data/adult.csv',
    header=1,
    index_col=False,
    names=adult_columns,
)
# Keep only a handful of columns for this example.
selected_columns = [
    'age', 'workclass', 'education', 'gender', 'hours-per-week',
    'occupation', 'income',
]
data = data[selected_columns]
# display() (from IPython.display) renders nicely formatted output in a
# Jupyter notebook.
display(data.head())

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
1,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
2,53,Private,11th,Male,40,Handlers-cleaners,<=50K
3,28,Private,Bachelors,Female,40,Prof-specialty,<=50K
4,37,Private,Masters,Female,40,Exec-managerial,<=50K


In [23]:
# Summarize each column's dtype and non-null count; object columns hold the
# string-valued categorical data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31142 entries, 0 to 31141
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             31142 non-null  int64 
 1   workclass       31142 non-null  object
 2   education       31142 non-null  object
 3   gender          31142 non-null  object
 4   hours-per-week  31142 non-null  int64 
 5   occupation      31142 non-null  object
 6   income          31142 non-null  object
dtypes: int64(2), object(5)
memory usage: 1.7+ MB


문자열로 된 범주형 데이터 확인하기

In [16]:
# Count how often each category occurs in the string-valued gender column.
gender_counts = data['gender'].value_counts()
print(gender_counts)

 Male      20839
 Female    10303
Name: gender, dtype: int64


In [17]:
original_columns = list(data.columns)
print('원본 특성 : \n', original_columns, '\n')
# pd.get_dummies() automatically one-hot encodes every column that has
# object dtype or is categorical; numeric columns are left as they are.
data_dummies = pd.get_dummies(data)
dummy_columns = list(data_dummies.columns)
print('get_dummies 후의 특성 : \n', dummy_columns, '\n')

원본 특성 : 
 ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income'] 

get_dummies 후의 특성 : 
 ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupati

In [18]:
# Preview the first rows of the one-hot encoded frame (shown as the cell output).
data_dummies.head()

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,50,13,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,38,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,53,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,28,40,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,37,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [19]:
# The original data has 7 columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31142 entries, 0 to 31141
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             31142 non-null  int64 
 1   workclass       31142 non-null  object
 2   education       31142 non-null  object
 3   gender          31142 non-null  object
 4   hours-per-week  31142 non-null  int64 
 5   occupation      31142 non-null  object
 6   income          31142 non-null  object
dtypes: int64(2), object(5)
memory usage: 1.7+ MB


In [20]:
# The dummy-encoded data has 46 columns.
data_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31142 entries, 0 to 31141
Data columns (total 46 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   age                            31142 non-null  int64
 1   hours-per-week                 31142 non-null  int64
 2   workclass_ ?                   31142 non-null  uint8
 3   workclass_ Federal-gov         31142 non-null  uint8
 4   workclass_ Local-gov           31142 non-null  uint8
 5   workclass_ Never-worked        31142 non-null  uint8
 6   workclass_ Private             31142 non-null  uint8
 7   workclass_ Self-emp-inc        31142 non-null  uint8
 8   workclass_ Self-emp-not-inc    31142 non-null  uint8
 9   workclass_ State-gov           31142 non-null  uint8
 10  workclass_ Without-pay         31142 non-null  uint8
 11  education_ 10th                31142 non-null  uint8
 12  education_ 11th                31142 non-null  uint8
 13  education_ 12th 

In [21]:
# The target (income) has to be separated from the inputs before training.
# Label-based slicing is inclusive on both ends, so this keeps every column
# from 'age' through 'occupation_ Transport-moving' and leaves out the two
# trailing income_* indicator columns.
features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']
X = features.values
# The target is the "income_ <=50K" indicator: 1 when income is <=50K.
y = data_dummies['income_ <=50K'].values

print(f'X.shape : {X.shape}, y.shape : {y.shape}')

X.shape : (31142, 44), y.shape : (31142,)


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out a test split (default train/test proportions) with a fixed seed
# so the result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
# The printed label says "test score", so evaluate on the held-out test
# split; scoring on the training split would report training accuracy
# under a misleading label.
print('테스트 점수 : {:.2f}'.format(logreg.score(X_test, y_test)))

테스트 점수 : 0.81


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
from sklearn.preprocessing import OneHotEncoder

In [26]:
# Show the un-encoded frame again before building the scikit-learn pipeline on it.
display(data.head())

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
1,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
2,53,Private,11th,Male,40,Handlers-cleaners,<=50K
3,28,Private,Bachelors,Female,40,Prof-specialty,<=50K
4,37,Private,Masters,Female,40,Exec-managerial,<=50K


In [27]:
# Use a ColumnTransformer to standardize the continuous columns (age and
# hours-per-week) and one-hot encode the categorical columns in one object.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

continuous_columns = ['age', 'hours-per-week']
categorical_columns = ['workclass', 'education', 'gender', 'occupation']

# NOTE(review): scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output` and later releases remove the old name; `sparse` is kept
# here to match the environment this notebook was run in -- confirm before
# upgrading scikit-learn.
ct = ColumnTransformer(
    [
        ('scaling', StandardScaler(), continuous_columns),
        ('onehot', OneHotEncoder(sparse=False), categorical_columns),
    ]
)

In [28]:
# Show the transformer's configuration as the cell output.
ct

In [29]:
# Separate the predictors from the target column, then split both.
data_features = data.drop(columns='income')
X_train, X_test, y_train, y_test = train_test_split(
    data_features, data['income'], random_state=42
)
# Fit the transformer on the training split only, then apply it to that
# same split.
X_train_trans = ct.fit(X_train).transform(X_train)
# 44 features are produced (see the printed shape), and the continuous
# features are now scaled as well.
print(X_train_trans.shape)

(23356, 44)




In [30]:
# Original author's note: scaling the data did not change the score.
logreg = LogisticRegression().fit(X_train_trans, y_train)

# Reuse the transformer fitted on the training split; it is not refitted on
# the test data.
X_test_trans = ct.transform(X_test)
# Accuracy on the held-out test split (shown as the cell output).
logreg.score(X_test_trans, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.803108142820447

숫자로 표현된 범주형 특성

In [31]:
# Build a small DataFrame with one numeric feature and one string-valued
# categorical feature.
import pandas as pd

demo_df = pd.DataFrame(
    {
        '숫자 특성': [0, 1, 2, 1],
        '범주형 특성': ['양말', '여우', '양말', '상자'],
    }
)
display(demo_df)

Unnamed: 0,숫자 특성,범주형 특성
0,0,양말
1,1,여우
2,2,양말
3,1,상자


In [32]:
# By default get_dummies encodes only the string feature; the numeric
# feature is passed through unchanged.
demo_dummies = pd.get_dummies(demo_df)
display(demo_dummies)

Unnamed: 0,숫자 특성,범주형 특성_상자,범주형 특성_양말,범주형 특성_여우
0,0,0,1,0
1,1,0,0,1
2,2,0,1,0
3,1,1,0,0


In [33]:
# To turn the numeric feature into dummy variables as well, name the columns
# to encode explicitly through the `columns` parameter. The numeric column is
# additionally cast to str here, which changes demo_df in place.
columns_to_encode = ['숫자 특성', '범주형 특성']
demo_df['숫자 특성'] = demo_df['숫자 특성'].astype(str)
display(pd.get_dummies(demo_df, columns=columns_to_encode))

Unnamed: 0,숫자 특성_0,숫자 특성_1,숫자 특성_2,범주형 특성_상자,범주형 특성_양말,범주형 특성_여우
0,1,0,0,0,1,0
1,0,1,0,0,0,1
2,0,0,1,0,1,0
3,0,1,0,1,0,0


In [34]:
# OneHotEncoder and ColumnTransformer: handling categorical variables with
# scikit-learn.
# OneHotEncoder encodes every column it is given (strings and integers alike).

# Unlike get_dummies, OneHotEncoder has to be fitted before it can transform.
from sklearn.preprocessing import OneHotEncoder

# sparse=False asks for a dense array; the default (True) returns a sparse
# matrix instead.
ohe = OneHotEncoder(sparse=False)
demo_encoded = ohe.fit_transform(demo_df)
print(demo_encoded)

[[1. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 1. 0.]
 [0. 1. 0. 1. 0. 0.]]




In [36]:
# Retrieve the name of each transformed feature; every name combines the
# source column name with the category it encodes.
print(ohe.get_feature_names_out())

['숫자 특성_0' '숫자 특성_1' '숫자 특성_2' '범주형 특성_상자' '범주형 특성_양말' '범주형 특성_여우']
