## Greedy feature selection

In [48]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

In [6]:
# Load the penguins data, then treat rows whose 'sex' is the literal '.' as missing
df = pd.read_csv('document/penguins_size.csv')
sex_is_dot = df['sex'] == '.'
df.loc[sex_is_dot, 'sex'] = np.nan

In [7]:
# thresh=3 keeps only rows with at least 3 non-missing values; sparser rows are dropped
df = df.dropna(thresh=3)

In [53]:
# Separate the target column from the feature columns
y = df['species']
X = df.drop(columns='species')

# Turn missing 'sex' values into their own category, the literal string 'NaN'
imputer = SimpleImputer(strategy='constant', fill_value='NaN').set_output(transform='pandas')
X['sex'] = imputer.fit_transform(X[['sex']])


# Dummy-variable (one-hot) transformer that can be used as a Pipeline step
class GetDummies(BaseEstimator, TransformerMixin):
    """One-hot encode a DataFrame with ``pd.get_dummies``, aligned to the fit-time columns.

    ``fit`` records the dummy column names produced for the training data;
    ``transform`` encodes new data and reindexes it to exactly those columns,
    so every transformed frame has the same layout.

    Attributes:
        X_columns: list of dummy column names recorded by ``fit``
            (``None`` until ``fit`` has been called).
    """

    def __init__(self):
        self.X_columns = None

    def fit(self, X, y=None):
        """Record the dummy-encoded column names of ``X``.

        Args:
            X: DataFrame to learn the column layout from.
            y: Ignored. Optional so that ``fit(X)`` and ``fit_transform(X)``
                work without a target, as for other transformers.

        Returns:
            self
        """
        self.X_columns = pd.get_dummies(X).columns.to_list()
        return self

    def transform(self, X):
        """Dummy-encode ``X`` and align it to the columns recorded by ``fit``.

        Columns recorded at fit time but absent from the encoded ``X`` are
        added and filled with 0; encoded columns that were not recorded at
        fit time are dropped.

        Args:
            X: DataFrame to encode.

        Returns:
            DataFrame with the columns listed in ``self.X_columns``.
        """
        X_dummies = pd.get_dummies(X)
        return X_dummies.reindex(columns=self.X_columns, fill_value=0)
    
    
# Feature engineering
# Polynomial features: for inputs [a, b], degree=2 without the bias column
# produces [a, b, a^2, a*b, b^2] in that order, so the names below follow it.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_input_cols = ['culmen_length_mm', 'culmen_depth_mm']
poly_output_cols = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'culmen_length_mm^2',
    # Interaction term: product of the two inputs, hence named after both
    'culmen_length_mm * culmen_depth_mm',
    'culmen_depth_mm^2',
]
X[poly_output_cols] = poly.fit_transform(X[poly_input_cols])

# Arithmetic combinations of the two culmen measurements
X['culmen_diff'] = X['culmen_length_mm'] - X['culmen_depth_mm']
X['culmen_ratio'] = X['culmen_length_mm'] / X['culmen_depth_mm']


# Pipeline: dummy-encode -> standardize -> logistic regression
pipeline_steps = [
    ('dummy', GetDummies()),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)


# Cross-validation splitter: 3 shuffled folds with a fixed seed
cv = KFold(n_splits=3, shuffle=True, random_state=0)




In [213]:
# Greedy feature selection
# gfs = GreedyFeatureSelection(pipeline=pipeline, cv=cv)
# gfs.select_feature(X, y)
# gfs.scores -> the best score of each selection round
# gfs.selected_features -> the features that were selected


class GreedyFeatureSelection:
    """Greedy forward feature selection scored by cross-validated accuracy.

    Starting from no features, each round tries adding every remaining
    column to the already-selected ones, scores each candidate set with
    ``cross_val_score`` (mean accuracy), and keeps the best candidate. The
    search stops when the best candidate does not strictly improve on the
    previous round's score, or when no columns are left to try.

    Attributes:
        pipeline: estimator handed to ``cross_val_score``.
        cv: cross-validation splitter handed to ``cross_val_score``.
        scores: best mean accuracy of each accepted round, in order.
        selected_features: selected column names, in selection order.
    """

    def __init__(self, pipeline, cv):
        self.pipeline = pipeline
        self.cv = cv
        self.scores = []
        self.selected_features = []

    def select_feature(self, X, y):
        """Run the greedy search and store the result on the instance.

        Results are written to ``self.selected_features`` and
        ``self.scores``; both are reset at the start of every call.

        Args:
            X: DataFrame of candidate feature columns.
            y: target values passed to ``cross_val_score``.
        """
        # Start from a clean slate so a repeated call does not build on
        # the columns and scores of a previous run.
        self.scores = []
        self.selected_features = []

        remaining = X.columns.to_list()

        # Looping on `remaining` ends the search cleanly once every column
        # has been selected (or when X has no columns at all).
        while remaining:
            # Mean CV accuracy obtained by adding each remaining column
            candidate_scores = {}
            for col in remaining:
                # Candidate column first, followed by the columns chosen so far
                cols = [col] + self.selected_features
                fold_scores = cross_val_score(
                    self.pipeline, X[cols], y, cv=self.cv, scoring='accuracy'
                )
                candidate_scores[col] = fold_scores.mean()

            # Pick the candidate with the highest accuracy
            best_col = max(candidate_scores, key=candidate_scores.get)
            best_score = candidate_scores[best_col]

            # Stop when the best candidate fails to beat the previous round;
            # the first round has nothing to compare against.
            if self.scores and best_score <= self.scores[-1]:
                break

            self.selected_features.append(best_col)
            self.scores.append(best_score)
            remaining = [col for col in remaining if col != best_col]

            

    

In [214]:
# Run the greedy search over every column of X with the pipeline and CV splitter
gfs = GreedyFeatureSelection(
    pipeline=pipeline,
    cv=cv,
)
gfs.select_feature(X, y)

In [215]:
gfs.selected_features

['culmen_ratio', 'island', 'culmen_diff', 'body_mass_g']

In [216]:
gfs.scores

[0.9619883040935672, 0.9912280701754387, 0.9970760233918128, 1.0]