### 1. 원핫 인코딩과 레이블 인코딩

- 레이블 인코딩: 문자열을 범주형 숫자값으로 변환
    - 기존의 feature engineering이 이와 같은 작업을 진행한 것임
      - 일괄적으로 문자열을 레이블 인코딩해주는 함수도 **사이킷런에서 제공함 ( LabelEncoder() )**
      - 선형 회귀와 같이 숫자값에 의미를 부여하는 경우에는 레이블 인코딩이 결과 예측을 왜곡할 수 있음
         - 트리 관련 머신러닝 기법에서는 사용 가능
- 원핫 인코딩: 각 문자열 범주마다 **새로운 feature**를 만들고, 해당 범주에 해당하는 경우에만 값을 부여해주는 방식

In [1]:
import pickle

# Restore the DataFrame stored in the pickle file (presumably written by an
# earlier notebook step, judging by the "step2" file name -- confirm).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that you produced yourself.
with open('titanic_step2_feature_add.pickle', mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

In [2]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Initial,Family,Ticket_initial2,Ticket_Num_Cut,HighChance,LowChance
0,1,0.0,3,0,3.0,0.0,U,0,0.0,2,0,3,0,0
1,2,1.0,1,1,4.0,3.0,C,1,2.0,2,1,3,2,0
2,3,1.0,3,1,3.0,0.0,U,0,1.0,1,2,9,0,0
3,4,1.0,1,1,4.0,2.0,C,0,2.0,2,3,5,2,0
4,5,0.0,3,0,4.0,0.0,U,0,0.0,1,3,9,0,2


### 레이블 인코딩과 원핫 인코딩 테스트

In [9]:
# Two independent working copies of the loaded frame: one for the
# label-encoding experiment and one for the one-hot experiment.
df_label, df_onehot = df.copy(), df.copy()

# The first n_train rows are treated as the training portion.
n_train = 891
train = df.iloc[:n_train]  # (891, 14)

# Double brackets keep the target as a one-column DataFrame rather than a
# Series; .copy() detaches it from df.
y_train = train.loc[:, ['Survived']].copy()

In [11]:
# The passenger identifier and the target are removed so that only feature
# columns remain in the label-encoding working copy.
drop_features = ['PassengerId', 'Survived']
df_label = df_label.drop(columns=drop_features).copy()
df_label.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Initial,Family,Ticket_initial2,Ticket_Num_Cut,HighChance,LowChance
0,3,0,3.0,0.0,U,0,0.0,2,0,3,0,0
1,1,1,4.0,3.0,C,1,2.0,2,1,3,2,0
2,3,1,3.0,0.0,U,0,1.0,1,2,9,0,0
3,1,1,4.0,2.0,C,0,2.0,2,3,5,2,0
4,3,0,4.0,0.0,U,0,0.0,1,3,9,0,2


In [12]:
# Remove the same columns (drop_features, defined in the previous cell) from
# the one-hot working copy.
# NOTE(review): df_onehot is reassigned from df_label in the One-hot Encoding
# cell further down, so this intermediate result is only used for the preview.
df_onehot = df_onehot.drop(columns=drop_features).copy()
df_onehot.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Initial,Family,Ticket_initial2,Ticket_Num_Cut,HighChance,LowChance
0,3,0,3.0,0.0,U,0,0.0,2,0,3,0,0
1,1,1,4.0,3.0,C,1,2.0,2,1,3,2,0
2,3,1,3.0,0.0,U,0,1.0,1,2,9,0,0
3,1,1,4.0,2.0,C,0,2.0,2,3,5,2,0
4,3,0,4.0,0.0,U,0,0.0,1,3,9,0,2


### Label Encoding

In [13]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Label-encode every column independently: each column is fitted on its own
# values and replaced by the resulting integer codes. This also re-codes the
# columns that were already numeric (e.g. Pclass 1/3 becomes 0/2 in the
# preview below).
df_label = df_label.apply(lambda column: LabelEncoder().fit_transform(column))
df_label.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Initial,Family,Ticket_initial2,Ticket_Num_Cut,HighChance,LowChance
0,2,0,3,0,8,0,0,1,0,3,0,0
1,0,1,4,3,2,1,2,1,1,3,2,0
2,2,1,3,0,8,0,1,0,2,9,0,0
3,0,1,4,2,2,0,2,1,3,5,2,0
4,2,0,4,0,8,0,0,0,3,9,0,2


### One-hot Encoding
- 범주가 많을 경우, 머신러닝 계산 시간이 매우 오래 걸리고, 불필요한 feature로 예측을 하기 때문에 성능이 오히려 안 좋아진다.
- **따라서 feature별 중요도를 계산해서, 예측 성능에 유의미한 feature를 중심으로 하도록 만들어야 한다.**

In [14]:
import pandas as pd

# One-hot encode every label-encoded column: each (column, code) pair becomes
# its own indicator column named "<column>_<code>", as seen in the preview.
onehot_cols = list(df_label.columns)
df_onehot = pd.get_dummies(data=df_label, columns=onehot_cols)
df_onehot.head()

Unnamed: 0,Pclass_0,Pclass_1,Pclass_2,Sex_0,Sex_1,Age_0,Age_1,Age_2,Age_3,Age_4,...,HighChance_0,HighChance_1,HighChance_2,HighChance_3,HighChance_4,HighChance_5,HighChance_6,LowChance_0,LowChance_1,LowChance_2
0,0,0,1,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
1,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
2,0,0,1,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
3,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
4,0,0,1,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1


In [15]:
import pickle

# Persist the one-hot encoded feature table.
with open('titanic_step3_feature_onehot_encoding.pickle', mode='wb') as features_file:
    pickle.dump(df_onehot, features_file)

# Persist the training target.
# NOTE(review): this file name ends in "_pickle" rather than ".pickle", unlike
# the other pickle files in this notebook -- looks like a typo. It is left
# unchanged here because code that loads the file may rely on the exact name;
# confirm and rename both sides together.
with open('titanic_step3_feature_encoding_y_pickle', mode='wb') as target_file:
    pickle.dump(y_train, target_file)