# 特徵挑選

## 作業程式碼

本作業將請學員完成以下要求：
1. 請至 Kaggle 平台找尋欲探索的資料集，進行本次作業。
2. 實作 Exhaustive Search
3. 實作 Sequential Forward/Backward Feature Selection
4. 實作 Sequential Floating Forward/Backward Feature Selection
5. 實作 Recursive Feature Elimination
6. 實作 Recursive Feature Elimination with Cross-Validation

> 注意：由於目前尚未教學建立機器學習模型，資料集請以「預測類別特徵」為主，以利參考範例程式碼進行實作

# Import packages

In [1]:
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

warnings.filterwarnings("ignore")

In [2]:
import os

folder = './data/'
path = os.path.join(folder, 'all_video_games.csv')

raw_data = pd.read_csv(path)  # 此行要填入資料路徑
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14055 entries, 0 to 14054
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               14034 non-null  object 
 1   Release Date        13991 non-null  object 
 2   Developer           13917 non-null  object 
 3   Publisher           13917 non-null  object 
 4   Genres              14034 non-null  object 
 5   Genres Splitted     14034 non-null  object 
 6   Product Rating      11005 non-null  object 
 7   User Score          11714 non-null  float64
 8   User Ratings Count  11299 non-null  float64
 9   Platforms Info      14055 non-null  object 
dtypes: float64(2), object(8)
memory usage: 1.1+ MB


In [3]:
from sklearn.preprocessing import LabelEncoder

# 把遺失值過多的欄位排除掉
raw_data = raw_data.dropna(axis=1, thresh=int(round(raw_data.shape[0] * 0.5)))

# 增加類別型欄位的 Label Encoder
category_var1 = 'Genres'
label_encoder = LabelEncoder()
raw_data[category_var1 + '_label_encoded'] = label_encoder.fit_transform(raw_data[category_var1])

# 增加年份欄位
raw_data['Release Date'] = pd.to_datetime(raw_data['Release Date'], format='%m/%d/%Y')
raw_data['Release Year'] = raw_data['Release Date'].dt.year

# 為了方便測試，我們只取 int64, float64 以及 int32 型別的欄位, 並隨機取 500 筆資料
numerics = ['int64', 'float64', 'int32']
raw_data = raw_data.select_dtypes(include=numerics)
raw_data = raw_data.sample(n=500, random_state=1)
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 10703 to 4139
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   User Score            417 non-null    float64
 1   User Ratings Count    409 non-null    float64
 2   Genres_label_encoded  500 non-null    int32  
 3   Release Year          500 non-null    float64
dtypes: float64(3), int32(1)
memory usage: 17.6 KB


In [4]:
raw_data['ID'] = range(1, len(raw_data) + 1)
raw_data.dropna(subset=['User Score'], inplace=True)

pk = 'ID'
target = 'User Score'
remove_features = ['Platforms Info', 'Developer', 'Publisher', 'Genres']

numerical_features = [i for i in raw_data.columns if
                      i not in [pk, target] + remove_features and raw_data[i].dtype != "object"]
classical_features = [i for i in raw_data.columns if
                      i not in [pk, target] + remove_features and raw_data[i].dtype == "object"]

X_train, X_test, y_train, y_test = train_test_split(raw_data[numerical_features + classical_features],
                                                    raw_data[target].astype("int"), test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(333, 3) (84, 3) (333,) (84,)


In [5]:
def onehot_encoding_features(one_train_series,
                             one_test_series):
    # 產生 One-Hot Encoding Object
    onehot_encoder = OneHotEncoder(handle_unknown='ignore')

    # 用訓練資料配適 One-Hot Encoding
    onehot_encoder = onehot_encoder.fit(one_train_series.values.reshape((-1, 1)))

    # 產生 One-Hot Encoding 的資料型態
    encoded_train = pd.DataFrame(onehot_encoder.transform(one_train_series.values.reshape((-1, 1))).toarray(),
                                 columns=onehot_encoder.categories_[0].tolist())
    encoded_test = pd.DataFrame(onehot_encoder.transform(one_test_series.values.reshape((-1, 1))).toarray(),
                                columns=onehot_encoder.categories_[0].tolist())

    return encoded_train, encoded_test


# 把類別資料轉成 One-Hot Encoding
encoded_data = [
    onehot_encoding_features(one_train_series=X_train[one_column], one_test_series=X_test[one_column]) for
    one_column in classical_features]

# 建立 One-Hot Encoding 後的訓練資料
preprocessed_X_train = pd.concat([X_train.reset_index(drop=True)] + [
    data[0] for data in encoded_data
], axis=1).drop(columns=classical_features)

# 建立 One-Hot Encoding 後的測試資料
preprocessed_X_test = pd.concat([X_test.reset_index(drop=True)] + [
    data[1] for data in encoded_data
], axis=1).drop(columns=classical_features)

print(preprocessed_X_train.shape, preprocessed_X_test.shape)

(333, 3) (84, 3)


# Exhaustive Feature Selection
評估指標參考連結：https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 

In [6]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

### 建立模型物件 ###
iris = load_iris()
X = iris.data
y = iris.target

knn = KNeighborsClassifier(n_neighbors=3)

### 建立特徵挑選物件 ###
efs = ExhaustiveFeatureSelector(knn,
                                min_features=1,
                                max_features=4,
                                scoring='accuracy',
                                print_progress=True,
                                cv=5)

### 開始執行特徵挑選 ###
efs = efs.fit(X, y)

# 輸出最好的特徵組合
print('Best accuracy score: %.2f' % efs.best_score_)
print('Best subset (indices):', efs.best_idx_)
print('Best subset (corresponding names):', efs.best_feature_names_)

Features: 15/15

Best accuracy score: 0.97
Best subset (indices): (0, 2, 3)
Best subset (corresponding names): ('0', '2', '3')


# Sequential Forward Selection

程式碼參考連結：https://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/#overview    
評估指標參考連結：https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 

In [7]:

from mlxtend.feature_selection import SequentialFeatureSelector

### 建立模型物件 ###
model = XGBRegressor()

### 建立特徵挑選物件 ###
sfs = SequentialFeatureSelector(model,
                                k_features=2,
                                forward=True,
                                floating=False,
                                cv=0)

### 開始執行特徵挑選 ###
sfs.fit(preprocessed_X_train, y_train)

In [8]:
# 輸出每輪特徵挑選狀況
sfs.subsets_

{1: {'feature_idx': (0,),
  'cv_scores': array([0.56504338]),
  'avg_score': 0.5650433785622497,
  'feature_names': ('User Ratings Count',)},
 2: {'feature_idx': (0, 1),
  'cv_scores': array([0.95553632]),
  'avg_score': 0.9555363181813552,
  'feature_names': ('User Ratings Count', 'Genres_label_encoded')}}

In [9]:
# 輸出被選入的特徵
sfs.k_feature_names_

('User Ratings Count', 'Genres_label_encoded')

# Sequential Backward Selection

In [10]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [11]:
model = XGBRegressor()

# 建立特徵挑選物件
sbs = SequentialFeatureSelector(model,
                                k_features=2,
                                forward=False,
                                floating=False,
                                cv=0)

# 開始執行特徵挑選
sbs.fit(preprocessed_X_train, y_train)

In [12]:
# 輸出特徵挑選過程
sbs.subsets_

{3: {'feature_idx': (0, 1, 2),
  'cv_scores': array([0.99688518]),
  'avg_score': 0.9968851777304926,
  'feature_names': ('User Ratings Count',
   'Genres_label_encoded',
   'Release Year')},
 2: {'feature_idx': (0, 1),
  'cv_scores': array([0.95553632]),
  'avg_score': 0.9555363181813552,
  'feature_names': ('User Ratings Count', 'Genres_label_encoded')}}

In [13]:
# 輸出被選入的特徵
sbs.k_feature_names_

('User Ratings Count', 'Genres_label_encoded')

# Sequential Floating Forward Selection

In [14]:
from mlxtend.feature_selection import SequentialFeatureSelector

model = XGBRegressor()

# 建立特徵挑選物件
sffs = SequentialFeatureSelector(estimator=model,
                                 k_features=2,
                                 scoring="neg_mean_squared_error",
                                 cv=0,
                                 floating=True,
                                 forward=True)

# 開始執行特徵挑選
sffs.fit(preprocessed_X_train, y_train)

In [15]:
# 輸出特徵挑選過程
sffs.subsets_

{1: {'feature_idx': (0,),
  'cv_scores': array([-0.80362383]),
  'avg_score': -0.8036238282149123,
  'feature_names': ('User Ratings Count',)},
 2: {'feature_idx': (0, 1),
  'cv_scores': array([-0.08215089]),
  'avg_score': -0.08215089146480084,
  'feature_names': ('User Ratings Count', 'Genres_label_encoded')}}

In [16]:
# 輸出被選入的特徵
sffs.k_feature_names_

('User Ratings Count', 'Genres_label_encoded')

# Sequential Floating Backward Selection

In [17]:
from mlxtend.feature_selection import SequentialFeatureSelector

model = XGBRegressor()

# 建立特徵挑選物件
sfbs = SequentialFeatureSelector(estimator=model,
                                 k_features=2,
                                 floating=True,
                                 cv=0,
                                 forward=False)

# 開始執行特徵挑選
sfbs.fit(preprocessed_X_train, y_train)

In [18]:
# 輸出被挑選到的特徵
sfbs.k_feature_names_

('User Ratings Count', 'Genres_label_encoded')

# Recursive Feature Elimination

In [19]:
from sklearn.feature_selection import RFE

In [20]:
model = XGBRegressor()

# 建立 RFE 物件
rfe = RFE(estimator=model,
          n_features_to_select=5,
          step=5)

# 執行 RFE
rfe.fit(preprocessed_X_train, y_train)

In [21]:
# 輸出被選入的特徵
select_index = rfe.get_feature_names_out()
print(select_index)

['User Ratings Count' 'Genres_label_encoded' 'Release Year']


# Recursive Feature Elimination with Cross-Validation

In [22]:
from sklearn.feature_selection import RFECV

In [23]:
model = XGBRegressor()

# 建立 RFECV 物件
rfecv = RFECV(estimator=model,
              min_features_to_select=5,
              step=5,
              cv=5,
              scoring="neg_mean_squared_error",
              verbose=1)

# 執行 RFECV
rfecv.fit(preprocessed_X_train, y_train)

In [24]:
rfecv.get_feature_names_out()

array(['User Ratings Count', 'Genres_label_encoded', 'Release Year'],
      dtype=object)

In [25]:
rfecv.cv_results_

{'mean_test_score': array([-2.09542274]),
 'std_test_score': array([0.44614391]),
 'split0_test_score': array([-1.90587697]),
 'split1_test_score': array([-1.7648923]),
 'split2_test_score': array([-2.46555902]),
 'split3_test_score': array([-1.57802369]),
 'split4_test_score': array([-2.76276173])}