### 중요도에 따른 feature 정리
- 각 feature가 분류 확률을 계산하는 데 기여한 정도를 **피처 중요도**라고 한다.
- 결과에 유의미한 영향을 주는 feature만을 중심으로 머신러닝 기법을 적용하기도 한다.

In [3]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including library warnings about input shapes or convergence.
warnings.filterwarnings('ignore')

import pickle
# Load the encoded feature table and the training target saved by an earlier step.
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open('titanic_step3_feature_onehot_encoding.pickle', 'rb') as pickle_file:
    df_onehot = pickle.load(pickle_file)
# NOTE(review): this filename ends in '_pickle' rather than '.pickle' — confirm it
# matches the file actually written by the previous step.
with open('titanic_step3_feature_encoding_y_pickle', 'rb') as pickle_file:
    y_train = pickle.load(pickle_file)
# The feature table is split positionally: the first `ntrain` rows are paired with
# y_train, the rest become X_test. Deriving the split point from the target
# (instead of the hard-coded 891) keeps X_train and y_train the same length.
ntrain = len(y_train)
X_train, X_test = df_onehot[:ntrain], df_onehot[ntrain:]
X_train.head()

Unnamed: 0,Pclass_0,Pclass_1,Pclass_2,Sex_0,Sex_1,Age_0,Age_1,Age_2,Age_3,Age_4,...,HighChance_0,HighChance_1,HighChance_2,HighChance_3,HighChance_4,HighChance_5,HighChance_6,LowChance_0,LowChance_1,LowChance_2
0,0,0,1,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
1,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
2,0,0,1,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
3,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
4,0,0,1,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1


In [5]:
import numpy as np # 각 모델에서 내부적으로 관련 라이브러리 사용 가능
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsClassifier             # 1. K-Nearest Neighbor(KNN)
from sklearn.linear_model import LogisticRegression            # 2. Logistic Regression
from sklearn.svm import SVC                                                # 3. SVC
from sklearn.tree import DecisionTreeClassifier                   # 4. Decision Tree
from sklearn.ensemble import RandomForestClassifier       # 5. Random Forest
from sklearn.ensemble import ExtraTreesClassifier             # 6. Extra Tree
from sklearn.ensemble import GradientBoostingClassifier  # 7. GBM
from sklearn.naive_bayes import GaussianNB                     # 8. GaussianNB
from xgboost import XGBClassifier                                     # 9. XGBoost
from lightgbm import LGBMClassifier

### default test

In [6]:
# Instantiate every candidate classifier with its default hyper-parameters
# (XGBoost only gets an explicit eval_metric).
knn_model = KNeighborsClassifier()
logreg_model = LogisticRegression()
svc_model = SVC()
decision_model = DecisionTreeClassifier()
random_model = RandomForestClassifier()
extra_model = ExtraTreesClassifier()
gbm_model = GradientBoostingClassifier()
nb_model = GaussianNB()
xgb_model = XGBClassifier(eval_metric='logloss')
lgbm_model = LGBMClassifier()

models = [
    knn_model,
    logreg_model,
    svc_model,
    decision_model,
    random_model,
    extra_model,
    gbm_model,
    nb_model,
    xgb_model,
    lgbm_model,
]

# 10-fold cross-validation with a fixed shuffle seed, so every model is scored
# on the same splits.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

# Flatten the target once and use the same 1-D array for both fit() and
# cross_val_score(); previously fit() received the un-flattened y_train while
# the cross-validation call received the flattened one.
y_train_1d = y_train.values.ravel()

# Maps classifier class name -> mean cross-validated accuracy in percent.
results = dict()
for alg in models:
    # Fit on the full training set so the model objects themselves carry fitted
    # attributes afterwards (later cells read feature_importances_ from them).
    alg.fit(X_train, y_train_1d)
    score = cross_val_score(alg, X_train, y_train_1d, cv=k_fold, scoring='accuracy')
    results[alg.__class__.__name__] = np.mean(score)*100

In [7]:
# Display the class-name -> mean CV accuracy (%) mapping built in the previous cell.
results

{'KNeighborsClassifier': 83.27715355805243,
 'LogisticRegression': 83.50187265917603,
 'SVC': 83.05118601747814,
 'DecisionTreeClassifier': 80.1323345817728,
 'RandomForestClassifier': 82.03870162297129,
 'ExtraTreesClassifier': 81.92634207240947,
 'GradientBoostingClassifier': 82.48938826466915,
 'GaussianNB': 71.81647940074906,
 'XGBClassifier': 81.4769038701623,
 'LGBMClassifier': 82.14981273408239}

### 정확도 높은 순으로 정렬하기

In [8]:
# Rank the (model name, accuracy) pairs by accuracy, best first.
sorted(results.items(), key=lambda  x:x[1], reverse=True)

[('LogisticRegression', 83.50187265917603),
 ('KNeighborsClassifier', 83.27715355805243),
 ('SVC', 83.05118601747814),
 ('GradientBoostingClassifier', 82.48938826466915),
 ('LGBMClassifier', 82.14981273408239),
 ('RandomForestClassifier', 82.03870162297129),
 ('ExtraTreesClassifier', 81.92634207240947),
 ('XGBClassifier', 81.4769038701623),
 ('DecisionTreeClassifier', 80.1323345817728),
 ('GaussianNB', 71.81647940074906)]

### feature 중요도를 제공하는 트리 기반 모델만으로 중요도 계산하기

In [9]:
# The four already-fitted models whose feature_importances_ are read in the following cells.
tree_models = [random_model, extra_model, gbm_model, xgb_model]

### 트리 관련 모델은 중요도가 측정됨
- 트리를 결정하는 과정에서 각 feature 가 얼마나 중요한지를 수치화하며, feature_importances_ 에 해당 값을 가지고 있음
- 해당 값을 기준으로 중요도가 낮은 feature 를 걸러낼 수 있음

In [10]:
# Print each model's per-feature importance vector, or "<name> X" when the
# model cannot provide one.
for alg in tree_models:
    model_name = alg.__class__.__name__
    try:
        # Only the attribute access can fail, so it is the only statement guarded.
        # AttributeError covers a missing attribute; scikit-learn's NotFittedError
        # also derives from AttributeError, so unfitted models land here too.
        feature_scores = alg.feature_importances_
    except AttributeError:
        print(model_name, "X")
    else:
        print(model_name)
        print(feature_scores)

RandomForestClassifier
[1.47146345e-02 1.50214399e-02 3.58106121e-02 7.00953557e-02
 5.64699737e-02 4.65514321e-03 8.42237687e-03 1.29722822e-02
 2.03238846e-02 1.86534969e-02 1.58370585e-02 5.12774626e-03
 1.27993158e-03 0.00000000e+00 2.36012643e-02 1.65002056e-02
 9.38635601e-03 5.10694062e-03 4.38488246e-03 6.61711460e-03
 3.03510574e-03 7.57610904e-03 7.63437201e-03 8.20487258e-03
 9.31207005e-03 1.52031703e-03 1.43971286e-03 5.45697904e-04
 3.32015990e-02 1.84108129e-02 1.42373101e-02 1.02235152e-02
 9.59349654e-02 2.45839717e-02 2.28398113e-02 1.09572743e-02
 4.41863450e-05 1.79556536e-04 2.22005265e-03 1.98632636e-03
 9.01514567e-04 3.07596282e-04 5.77366901e-04 4.51189365e-04
 1.60021783e-04 7.66641513e-04 1.63614431e-02 1.35321362e-02
 1.57952879e-02 6.77341739e-03 6.88222252e-03 9.01053067e-03
 2.73260634e-03 2.35483498e-03 1.47397957e-03 5.05122774e-03
 5.87942445e-03 5.35338412e-03 1.24390019e-02 7.90198921e-04
 3.08782642e-03 1.53047206e-03 3.89108545e-04 4.62216626e-03
 

- 중요도 기반 데이터프레임 작성하기

In [11]:
# One two-column frame per tree model: the feature name plus that model's
# importance score, with the score column named after the model variable.
feature_names = X_train.columns
random_model_importance = pd.DataFrame({
    'Feature': feature_names,
    'random_model': random_model.feature_importances_,
})
extra_model_importance = pd.DataFrame({
    'Feature': feature_names,
    'extra_model': extra_model.feature_importances_,
})
gbm_model_importance = pd.DataFrame({
    'Feature': feature_names,
    'gbm_model': gbm_model.feature_importances_,
})
xgb_model_importance = pd.DataFrame({
    'Feature': feature_names,
    'xgb_model': xgb_model.feature_importances_,
})

### multiple dataframe 합치기
- dataframes = [각 데이터프레임, ...]
- functools.reduce(lambda  left,right: pd.merge(left, right, on=['동일컬럼']), dataframes)

In [12]:
from functools import reduce


def _merge_on_feature(left, right):
    """Join two importance frames on their shared 'Feature' column."""
    return pd.merge(left, right, on=['Feature'])


# Fold the per-model frames left to right into one table with a column per model.
data_frames = [random_model_importance, extra_model_importance, gbm_model_importance, xgb_model_importance]
importances = reduce(_merge_on_feature, data_frames)

In [13]:
# Preview the first rows of the merged per-model importance table.
importances.head()

Unnamed: 0,Feature,random_model,extra_model,gbm_model,xgb_model
0,Pclass_0,0.014715,0.018555,0.014408,0.004698
1,Pclass_1,0.015021,0.017881,0.0,0.004681
2,Pclass_2,0.035811,0.03547,0.114703,0.084025
3,Sex_0,0.070095,0.060969,0.028755,0.010892
4,Sex_1,0.05647,0.08587,0.084164,0.0


- 항목별 평균 중요도 구하기

In [14]:
# Average only the per-model importance columns. A bare importances.mean(axis=1)
# also sees the string 'Feature' column, which raises TypeError on pandas >= 2.0,
# and would fold an existing 'avg' column into the mean if this cell is re-run.
model_columns = [col for col in importances.columns if col not in ('Feature', 'avg')]
importances['avg'] = importances[model_columns].mean(axis=1)

- 중요도 기반 정렬하기

In [15]:
# Reorder the rows so the feature with the highest average importance comes first.
importances = importances.sort_values('avg', ascending=False)

### 중요도가 높은 feature 만 선택하기

In [16]:
# Number of top-ranked features to keep; named so the cut-off is easy to find and tune.
TOP_N_FEATURES = 50
# The table is expected to be sorted by 'avg' descending at this point, so the
# first TOP_N_FEATURES rows are the most important features.
importances = importances[:TOP_N_FEATURES]

- 특정 컬럼만 선택해서, 데이터프레임 만들기

In [17]:
# Restrict both splits to the retained feature columns, in importance-table order.
selected_features = importances['Feature'].tolist()
train_importance = X_train[selected_features]
test_importance = X_test[selected_features]

In [18]:
# Preview the training features after restricting them to the selected columns.
train_importance.head()

Unnamed: 0,Initial_0,Pclass_2,Sex_1,Sex_0,LowChance_0,HighChance_0,Cabin_8,Fare_0,Ticket_Num_Cut_2,Embarked_0,...,Cabin_3,Family_4,Ticket_Num_Cut_6,Ticket_Num_Cut_3,HighChance_3,HighChance_2,Ticket_initial2_2,Fare_5,Family_3,Cabin_1
0,1,1,0,1,1,1,1,1,0,1,...,0,0,0,1,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,0,1,1,0,1,1,1,1,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,1,1,0,1,0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0


### 중요도가 높은 feature 로만 머신러닝 적용해보기

In [19]:
# Re-create every classifier from scratch so nothing fitted on the full feature
# set carries over into this comparison.
knn_model = KNeighborsClassifier()
logreg_model = LogisticRegression()
svc_model = SVC()
decision_model = DecisionTreeClassifier()
random_model = RandomForestClassifier()
extra_model = ExtraTreesClassifier()
gbm_model = GradientBoostingClassifier()
nb_model = GaussianNB()
xgb_model = XGBClassifier(eval_metric='logloss')
lgbm_model = LGBMClassifier()

models = [
    knn_model,
    logreg_model,
    svc_model,
    decision_model,
    random_model,
    extra_model,
    gbm_model,
    nb_model,
    xgb_model,
    lgbm_model,
]

# Same 10-fold splitter and seed as the full-feature run, so the two sets of
# scores are computed on identical splits.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

# Flatten the target once and use the same 1-D array for both fit() and
# cross_val_score(); previously fit() received the un-flattened y_train while
# the cross-validation call received the flattened one.
y_train_1d = y_train.values.ravel()

# Maps classifier class name -> mean cross-validated accuracy in percent,
# this time using only the selected feature columns.
results = dict()
for alg in models:
    # Fit on the full (reduced-column) training set so the model objects are
    # left in a fitted state after this cell.
    alg.fit(train_importance, y_train_1d)
    score = cross_val_score(alg, train_importance, y_train_1d, cv=k_fold, scoring='accuracy')
    results[alg.__class__.__name__] = np.mean(score)*100

In [20]:
# Rank the (model name, accuracy) pairs obtained with the selected features.
sorted(results.items(), key=lambda x: x[1], reverse=True) # reverse=True sorts from highest to lowest

[('LogisticRegression', 83.16354556803994),
 ('GradientBoostingClassifier', 82.82646691635455),
 ('SVC', 82.71535580524345),
 ('ExtraTreesClassifier', 82.48689138576779),
 ('RandomForestClassifier', 82.37578027465666),
 ('KNeighborsClassifier', 82.15480649188515),
 ('XGBClassifier', 82.15230961298377),
 ('LGBMClassifier', 81.81523096129837),
 ('DecisionTreeClassifier', 80.24469413233459),
 ('GaussianNB', 78.66541822721598)]

In [21]:
import pickle

# Persist the reduced train/test feature frames and the training target,
# one pickle file per object, written in the order listed here.
pickle_outputs = {
    'titanic_step4_importance_train.pickle': train_importance,
    'titanic_step4_importance_test.pickle': test_importance,
    'titanic_step4_importance_train_y.pickle': y_train,
}
for output_path, payload in pickle_outputs.items():
    with open(output_path, 'wb') as pickle_file:
        pickle.dump(payload, pickle_file)