# 원 핫 인코딩 (범주형 데이터 다루기)
- 머신러닝이나 딥러닝 알고리즘은 수치로 된 데이터만 이해할 수 있어 범주형 데이터를 수치형 데이터로 변환해준다
- 원 핫 인코딩 = 범주마다 칼럼을 하나씩 만들고, 해당되는 범주의 칼럼만 1로, 나머지는 0으로 채우는 것
---
- 파이썬 코드로 직접 구현하거나 pandas나 scikit-learn을 사용할 수도 있다
- Kaggle의 타이타닉 데이터 다운로드 : https://www.kaggle.com/c/titanic/data

In [2]:
import numpy as np
import pandas as pd

print(pd.__version__)
print(np.__version__)

0.24.1
1.16.2


In [3]:
# Load the Titanic train/test CSV files (paths are relative to the working directory)
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')
# Print the (rows, columns) shape of each frame
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


In [4]:
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
test.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [7]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [10]:
# Show summary statistics for the numeric columns
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


- 오브젝트 타입의 데이터만 따로 추출해 본다
- 이 중 카테고리 형태의 데이터가 무엇인지 보고 인코딩한다
- 원 핫 인코딩뿐만 아니라 NLP에서 사용했던 TF, TF-IDF의 인코딩도 가능하므로 어떠한 인코딩이 적합할지 생각해 본다

In [11]:
# Pull out only the object-dtype columns; .copy() makes obj_df independent of train
obj_df = train.select_dtypes(include=['object']).copy()
# Preview the first rows
obj_df.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


- 누락된 데이터(결측치)가 있는 행을 출력한다
- Cabin이 누락된 데이터 가장 많음 (결측치 처리는 따로 다룰 것)

In [12]:
obj_df[obj_df.isnull().any(axis=1)].head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
4,"Allen, Mr. William Henry",male,373450,,S
5,"Moran, Mr. James",male,330877,,Q
7,"Palsson, Master. Gosta Leonard",male,349909,,S


In [13]:
# Check whether Cabin is suitable as a categorical feature by looking at its most frequent values
obj_df['Cabin'].value_counts().head()

G6             4
C23 C25 C27    4
B96 B98        4
E101           3
D              3
Name: Cabin, dtype: int64

In [14]:
# Make copies so the results before and after processing can be compared
train_c_df = train.copy()
test_c_df = test.copy()

## 성별

In [15]:
train['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [16]:
# Sex has only two values, so it can be encoded directly as a single 0/1 column
# (male -> 0, female -> 1). The same mapping is applied in place to both frames.
for frame in (train, test):
    frame.loc[frame['Sex'] == 'male', 'Sex'] = 0
    frame.loc[frame['Sex'] == 'female', 'Sex'] = 1

# Alternative using apply:
# train['Sex'] = train['Sex'].apply(lambda s: 1 if s == 'female' else 0)
# test['Sex'] = test['Sex'].apply(lambda s: 1 if s == 'female' else 0)

In [17]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


## 사이킷런의 LabelEncoder로 인코딩 (범주를 정수 레이블로 바꾸는 레이블 인코딩)

In [18]:
from sklearn.preprocessing import LabelEncoder

def gender_to_int(data):
    """Encode the 'Sex' column of *data* as 0/1 using LabelEncoder.

    The encoder is fitted on the two labels 'male' and 'female'. In the
    output recorded in this notebook that yields female -> 0 and male -> 1,
    i.e. the opposite of the manual male -> 0 / female -> 1 encoding applied
    to ``train``/``test`` earlier in the notebook.

    Args:
        data: DataFrame with a 'Sex' column (expected to hold the strings
            'male' / 'female'). It is modified in place.

    Returns:
        The same ``data`` object, with 'Sex' replaced by the encoded values.

    NOTE(review): values other than the two fitted labels (including an
    already-encoded column) are not handled here; LabelEncoder.transform is
    expected to reject them -- confirm against the scikit-learn docs.
    """
    encoder = LabelEncoder().fit(['male', 'female'])
    data['Sex'] = encoder.transform(data['Sex'])
    return data

In [19]:
gender_to_int(train_c_df)
gender_to_int(test_c_df)
test_c_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S


## 승선 위치

In [20]:
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [21]:
# Add one boolean indicator column per embarkation code (C, S, Q, in that order).
# NOTE: a missing Embarked value compares unequal to every code, so such rows
# end up False in all three columns.
for code in ('C', 'S', 'Q'):
    train_c_df['Embarked_' + code] = (train_c_df['Embarked'] == code)

# Compare the original shape with the shape after adding the indicator columns
print(train.shape)
print(train_c_df.shape)

(891, 12)
(891, 15)


In [22]:
train_c_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_C,Embarked_S,Embarked_Q
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,False,True,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S,False,True,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S,False,True,False
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,False,True,False


## 판다스의 get_dummies로 원 핫 인코딩

In [23]:
# pandas' concat plays the role of R's rbind and cbind:
#   pd.concat([df1, df2, ...], axis=0) stacks the frames on top of each other
#   pd.concat([df1, df2, ...], axis=1) places the frames side by side
def dummy_data(data, columns):
    """One-hot encode the given columns with pandas.get_dummies.

    Each listed column is replaced by its indicator columns, which are
    appended on the right in the order the columns are listed.

    Args:
        data: source DataFrame. It is not modified; a new frame is built.
        columns: iterable of column names to encode.

    Returns:
        A DataFrame without the listed columns and with the generated
        "<column>_<value>" indicator columns added. If ``columns`` is empty
        the input object itself is returned.
    """
    for name in columns:
        # prefix gives every generated column the shared "<name>_" prefix
        indicators = pd.get_dummies(data[name], prefix=name)
        # Swap the original column for its indicator columns
        data = pd.concat([data.drop(name, axis=1), indicators], axis=1)
    return data

In [24]:
# Columns to one-hot encode. 'Sex' already holds 0/1 at this point, so its
# dummy columns come out as Sex_0 / Sex_1.
dummy_columns = ['Sex', 'Pclass', 'Embarked']
train_dummy = dummy_data(train, dummy_columns)
test_dummy = dummy_data(test, dummy_columns)

In [25]:
print('원핫인코딩 전 shape')
print(train.shape)
print(test.shape)

print('get_dummies로 원핫인코딩 후 shape')
print(train_dummy.shape)
print(test_dummy.shape)

원핫인코딩 전 shape
(891, 12)
(418, 11)
get_dummies로 원핫인코딩 후 shape
(891, 17)
(418, 16)


In [26]:
train_dummy.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_0,Sex_1,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,1,0,0,0,1,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,1,1,0,0,1,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,0,1,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,1,1,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,1,0,0,0,1,0,0,1


In [27]:
# When the encoded data is used as is, the feature set is built by dropping
# the columns that will not be used.

def drop_not_concerned(data, columns):
    """Return a new DataFrame without the given columns.

    Args:
        data: source DataFrame; it is left unchanged.
        columns: a column label or list of labels to remove.

    Returns:
        A new DataFrame holding every column of ``data`` except ``columns``.
    """
    return data.drop(columns=columns)

In [28]:
# Columns left out of the feature matrices
not_concerned_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
# Train features: also drop 'Survived', a column the test set does not have
X_train = drop_not_concerned(train_dummy, not_concerned_columns).drop(columns='Survived')
X_test = drop_not_concerned(test_dummy, not_concerned_columns)

In [29]:
X_train.head(3)

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_0,Sex_1,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,22.0,1,0,7.25,1,0,0,0,1,0,0,1
1,38.0,1,0,71.2833,0,1,1,0,0,1,0,0
2,26.0,0,0,7.925,0,1,0,0,1,0,0,1


In [30]:
X_test.tail(3)

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_0,Sex_1,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
415,38.5,0,0,7.25,1,0,0,0,1,0,0,1
416,,0,0,8.05,1,0,0,0,1,0,0,1
417,,1,1,22.3583,1,0,0,0,1,1,0,0
