# 特徵挑選 - 範例程式碼

本範例將使用 Kaggle 平台競賽中的 Spaceship Titanic 提供的資料集，連結如下：https://www.kaggle.com/competitions/spaceship-titanic 

本程式碼將詳細介紹以下數個知識點：
1. Exhaustive Search
2. Sequential Forward/Backward Feature Selection
3. Sequential Floating Forward/Backward Feature Selection
4. Recursive Feature Elimination
5. Recursive Feature Elimination with Cross-Validation

In [1]:
!pip install mlxtend==0.20.0

Collecting mlxtend==0.20.0
  Obtaining dependency information for mlxtend==0.20.0 from https://files.pythonhosted.org/packages/45/89/492924d6fc2cc9524f90febd0e9f7487c02261a8689c7c97348b09d0d071/mlxtend-0.20.0-py2.py3-none-any.whl.metadata
  Downloading mlxtend-0.20.0-py2.py3-none-any.whl.metadata (1.6 kB)
Downloading mlxtend-0.20.0-py2.py3-none-any.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.3 MB 2.6 MB/s eta 0:00:01
   --------- ------------------------------ 0.3/1.3 MB 3.8 MB/s eta 0:00:01
   -------------------- ------------------- 0.7/1.3 MB 5.3 MB/s eta 0:00:01
   ---------------------------------------- 1.3/1.3 MB 7.8 MB/s eta 0:00:00
Installing collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.23.1
    Uninstalling mlxtend-0.23.1:
      Successfully uninstalled mlxtend-0.23.1
Successfully installed mlxtend-0.20.0


In [4]:
pip install xgboost

Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/30/7d/41847e45ff075f3636c95d1000e0b75189aed4f1ae18c36812575bb42b4b/xgboost-3.1.2-py3-none-win_amd64.whl.metadata
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB 960.0 kB/s eta 0:01:15
   ---------------------------------------- 0.2/72.0 MB 2.8 MB/s eta 0:00:27
   ---------------------------------------- 0.5/72.0 MB 4.2 MB/s eta 0:00:18
   ---------------------------------------- 0.9/72.0 MB 5.1 MB/s eta 0:00:15
    --------------------------------------- 1.8/72.0 MB 8.1 MB/s eta 0:00:09
   -- ------------------------------------- 4.6/72.0 MB 17.1 MB/s eta 0:00:04
   --- ------------------------------------ 6.9

# Import packages

In [5]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

In [6]:
file_path = r"D:\Github\ML100Days\train.xlsx"
raw_data = pd.read_excel(file_path)
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   float64
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   float64
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(8), object(5)
memory usage: 891.5+ KB


In [7]:
raw_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,0.0,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,0.0,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,0.0,A/0/S,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,0.0,A/0/S,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,0.0,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [8]:
# 把遺失值過多的欄位排除掉
raw_data = raw_data.dropna()

In [9]:
# Define Features and Target
PK = "PassengerId"
target = "Transported"
remove_features = ["Destination", "Name", "Cabin"]
numerical_features = [i for i in raw_data.columns if i not in [PK, target]+remove_features and raw_data[i].dtype != "object"]
classical_features = [i for i in raw_data.columns if i not in [PK, target]+remove_features and raw_data[i].dtype == "object"]

In [10]:
# 切割成訓練、驗證與測試資料
xtrain, xtest, ytrain, ytest = train_test_split(raw_data[numerical_features+classical_features], raw_data[target].astype("int"), test_size = 0.2)

In [11]:
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

(5284, 9) (1322, 9) (5284,) (1322,)


In [12]:
def generate_one_hot_encoding_features(one_train_Series,
                                       one_test_Series):
    
    # 產生 One-Hot Encoding Object
    oneHotEncoding = OneHotEncoder(handle_unknown = "ignore")

    # 用訓練資料配適 One-Hot Encoding
    oneHotEncoding = oneHotEncoding.fit(one_train_Series.values.reshape((-1, 1)))

    # 產生 One-Hot Encoding 的資料型態
    oneHot_train_data = pd.DataFrame(oneHotEncoding.transform(one_train_Series.values.reshape((-1, 1))).toarray(), columns = oneHotEncoding.categories_[0].tolist() )
    oneHot_test_data = pd.DataFrame(oneHotEncoding.transform(one_test_Series.values.reshape((-1, 1))).toarray(), columns = oneHotEncoding.categories_[0].tolist() )

    return oneHot_train_data, oneHot_test_data

In [13]:
# 把類別資料轉成 One-Hot Encoding
OneHotEncoding_data = [generate_one_hot_encoding_features(one_train_Series = xtrain[one_column], one_test_Series = xtest[one_column]) for one_column in classical_features]

# 建立 One-Hot Encoding 後的訓練資料
preprocessed_xtrain = pd.concat([xtrain.reset_index(drop = True)] + [
    data[0] for data in OneHotEncoding_data
], axis = 1).drop(columns = classical_features)

# 建立 One-Hot Encoding 後的測試資料
preprocessed_xtest = pd.concat([xtest.reset_index(drop = True)]+[
    data[1] for data in OneHotEncoding_data
], axis = 1).drop(columns = classical_features)

In [14]:
print(preprocessed_xtrain.shape, preprocessed_xtest.shape)

(5284, 11) (1322, 11)


# Exhaustive Feature Selection
評估指標參考連結：https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 

In [15]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

iris = load_iris()
X = iris.data
y = iris.target

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn, 
           min_features=1,
           max_features=4,
           scoring='accuracy',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)

Features: 15/15

Best accuracy score: 0.97
Best subset (indices): (0, 2, 3)
Best subset (corresponding names): ('0', '2', '3')


In [17]:
y, ytrain.values

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 array([0, 1, 0, ..., 0, 0, 0]))

In [18]:
model = XGBClassifier()

# 建立特徵挑選物件
efs = ExhaustiveFeatureSelector(model,
                 min_features = 1,
                 max_features = 3,
                 scoring = "accuracy",
                 print_progress = True,
                 cv = 5)

# 開始執行特徵挑選
efs.fit(preprocessed_xtrain, ytrain)

Features: 231/231

In [19]:
# 輸出最好的特徵組合
efs.best_feature_names_ 

('RoomService', 'Spa', 'VRDeck')

# Sequential Forward Selection

程式碼參考連結：http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/#overview    
評估指標參考連結：https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 

In [20]:
# from sklearn.feature_selection import SequentialFeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector

In [21]:
model = XGBClassifier()

# 建立特徵挑選物件
sfs = SequentialFeatureSelector(model, 
                 k_features = 10,
                 forward = True,
                 floating = False,
                 cv = 0)

# 開始執行特徵挑選
sfs.fit(preprocessed_xtrain, ytrain)                          

In [22]:
# 輸出每一輪特徵挑選狀況
sfs.subsets_

{1: {'feature_idx': (0,),
  'cv_scores': array([0.71971991]),
  'avg_score': 0.7197199091597275,
  'feature_names': ('CryoSleep',)},
 2: {'feature_idx': (0, 4),
  'cv_scores': array([0.748486]),
  'avg_score': 0.7484859954579863,
  'feature_names': ('CryoSleep', 'FoodCourt')},
 3: {'feature_idx': (0, 1, 4),
  'cv_scores': array([0.79750189]),
  'avg_score': 0.7975018925056775,
  'feature_names': ('CryoSleep', 'Age', 'FoodCourt')},
 4: {'feature_idx': (0, 1, 4, 5),
  'cv_scores': array([0.83573051]),
  'avg_score': 0.8357305071915215,
  'feature_names': ('CryoSleep', 'Age', 'FoodCourt', 'ShoppingMall')},
 5: {'feature_idx': (0, 1, 4, 5, 6),
  'cv_scores': array([0.86260409]),
  'avg_score': 0.8626040878122634,
  'feature_names': ('CryoSleep', 'Age', 'FoodCourt', 'ShoppingMall', 'Spa')},
 6: {'feature_idx': (0, 1, 4, 5, 6, 7),
  'cv_scores': array([0.87717638]),
  'avg_score': 0.8771763815291446,
  'feature_names': ('CryoSleep',
   'Age',
   'FoodCourt',
   'ShoppingMall',
   'Spa',
   '

In [23]:
# 輸出被選入的特徵
sfs.k_feature_names_

('CryoSleep',
 'Age',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Earth',
 'Europa')

# Sequential Backward Selection

In [24]:
# from sklearn.feature_selection import SequentialFeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector

In [25]:
model = XGBClassifier()

# 建立特徵挑選物件
sbs = SequentialFeatureSelector(model,
                 k_features = 5,
                 forward = False,
                 floating = False,
                 cv = 0)

# 開始執行特徵挑選
sbs.fit(preprocessed_xtrain, ytrain)

In [26]:
# 輸出特徵挑選過程
sbs.subsets_

{11: {'feature_idx': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  'cv_scores': array([0.89061317]),
  'avg_score': 0.8906131718395155,
  'feature_names': ('CryoSleep',
   'Age',
   'VIP',
   'RoomService',
   'FoodCourt',
   'ShoppingMall',
   'Spa',
   'VRDeck',
   'Earth',
   'Europa',
   'Mars')},
 10: {'feature_idx': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
  'cv_scores': array([0.89155942]),
  'avg_score': 0.891559424678274,
  'feature_names': ('CryoSleep',
   'Age',
   'VIP',
   'RoomService',
   'FoodCourt',
   'ShoppingMall',
   'Spa',
   'VRDeck',
   'Earth',
   'Europa')},
 9: {'feature_idx': (0, 1, 3, 4, 5, 6, 7, 8, 9),
  'cv_scores': array([0.89042392]),
  'avg_score': 0.8904239212717638,
  'feature_names': ('CryoSleep',
   'Age',
   'RoomService',
   'FoodCourt',
   'ShoppingMall',
   'Spa',
   'VRDeck',
   'Earth',
   'Europa')},
 8: {'feature_idx': (0, 1, 3, 4, 5, 6, 7, 8),
  'cv_scores': array([0.89042392]),
  'avg_score': 0.8904239212717638,
  'feature_names': ('CryoSleep',
   'Age',


In [27]:
# 輸出被選入的特徵
sbs.k_feature_names_

('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')

# Sequential Floating Forward Selection

In [28]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [29]:
model = XGBClassifier()

# 建立特徵挑選物件
sffs = SequentialFeatureSelector(estimator = model,
                  k_features = 5,
                  scoring = "neg_mean_squared_error",
                  cv = 0,
                  floating = True,
                  forward = True)

# 開始執行特徵挑選
sffs.fit(preprocessed_xtrain, ytrain)

In [30]:
# 輸出特徵挑選過程
sffs.subsets_

{1: {'feature_idx': (0,),
  'cv_scores': array([-0.28028009]),
  'avg_score': -0.2802800908402725,
  'feature_names': ('CryoSleep',)},
 2: {'feature_idx': (0, 4),
  'cv_scores': array([-0.251514]),
  'avg_score': -0.2515140045420136,
  'feature_names': ('CryoSleep', 'FoodCourt')},
 3: {'feature_idx': (0, 1, 4),
  'cv_scores': array([-0.20249811]),
  'avg_score': -0.20249810749432248,
  'feature_names': ('CryoSleep', 'Age', 'FoodCourt')},
 4: {'feature_idx': (0, 1, 4, 5),
  'cv_scores': array([-0.16426949]),
  'avg_score': -0.1642694928084784,
  'feature_names': ('CryoSleep', 'Age', 'FoodCourt', 'ShoppingMall')},
 5: {'feature_idx': (0, 1, 4, 5, 6),
  'cv_scores': array([-0.13739591]),
  'avg_score': -0.13739591218773656,
  'feature_names': ('CryoSleep', 'Age', 'FoodCourt', 'ShoppingMall', 'Spa')}}

In [31]:
# 輸出被選入的特徵
sffs.k_feature_names_

('CryoSleep', 'Age', 'FoodCourt', 'ShoppingMall', 'Spa')

# Sequential Floating Backward Selection

In [32]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [33]:
model = XGBClassifier()

# 建立特徵挑選物件
sfbs = SequentialFeatureSelector(estimator = model,
                 k_features = 5,
                 floating = True,
                 cv = 0,
                 forward = False)

# 開始執行特徵挑選
sfbs.fit(preprocessed_xtrain, ytrain)

In [34]:
# 輸出被挑選到的特徵
sfbs.k_feature_names_

('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')

# Recursive Feature Elimination

In [35]:
from sklearn.feature_selection import RFE

In [36]:
model = XGBClassifier()

# 建立 RFE 物件
rfe = RFE(estimator = model, 
      n_features_to_select = 5,
      step = 5)

# 執行 RFE
rfe.fit(preprocessed_xtrain, ytrain)

In [37]:
# 輸出被選入的特徵
select_index = rfe.get_feature_names_out()
print(select_index)

['CryoSleep' 'Spa' 'VRDeck' 'Earth' 'Europa']


# Recursive Feature Elimination with Cross-Validation

In [71]:
from sklearn.feature_selection import RFECV

In [73]:
model = XGBClassifier()

# 建立 RFECV 物件
rfecv = RFECV(estimator = model,
        min_features_to_select = 5,
        step = 5,
        cv = 5,
        scoring = "accuracy",
        verbose = 1)

# 執行 RFECV
rfecv.fit(preprocessed_xtrain, ytrain)

Fitting estimator with 11 features.
Fitting estimator with 6 features.
Fitting estimator with 11 features.
Fitting estimator with 6 features.
Fitting estimator with 11 features.
Fitting estimator with 6 features.
Fitting estimator with 11 features.
Fitting estimator with 6 features.
Fitting estimator with 11 features.
Fitting estimator with 6 features.


In [74]:
rfecv.get_feature_names_out()

array(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'Earth', 'Europa', 'Mars'],
      dtype=object)

In [75]:
rfecv.cv_results_

{'mean_test_score': array([0.74072471, 0.75245657, 0.78103445]),
 'std_test_score': array([0.01032908, 0.01372396, 0.0090155 ]),
 'split0_test_score': array([0.75969726, 0.77578051, 0.7833491 ]),
 'split1_test_score': array([0.73888363, 0.74834437, 0.78807947]),
 'split2_test_score': array([0.74172185, 0.75307474, 0.79186377]),
 'split3_test_score': array([0.73320719, 0.75212867, 0.77483444]),
 'split4_test_score': array([0.73011364, 0.73295455, 0.76704545])}