# 機械学習をPythonで実践する-20　　～ 特徴量選択 ～

In [1]:
%load_ext autoreload
%autoreload 2
import polars as pl
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OrdinalEncoder, LabelEncoder, OneHotEncoder
# # import statsmodels.api as sma
from sklearn.model_selection import train_test_split ,cross_val_score, KFold, RepeatedKFold,StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, log_loss, confusion_matrix,ConfusionMatrixDisplay, \
accuracy_score, precision_score, recall_score,precision_recall_curve,f1_score,roc_curve,auc,get_scorer_names,roc_auc_score
from sklearn.compose import ColumnTransformer
# from sklearn import tree
# from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,AdaBoostClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from lightGBM_cv import lightGBM_classifier_cv_func
from category_encoders import TargetEncoder
from sklearn.linear_model import LogisticRegression

%matplotlib inline
import matplotlib.pyplot as plt


## Greedy Feature Selection
Kaggleのpenguinデータセットを使ってGreedy Feature Selectionをやってみる。  
GBDTではあまり積極的に特徴量選択を行わない※ことから、モデルにはロジスティック回帰を使う。  

※GBDTはノイズに強いため、元々ある特徴量は全て使い、特徴量エンジニアリングで増やした特徴量の効果をCVで検証して採用するかどうか決めることが多い様子。  

In [2]:
dtypes = {
    "species": str,
    'island': str,
    'culmen_length_mm': pl.Float32, # くちばしの長さ[mm]
    'culmen_depth_mm': pl.Float32, # くちばしの高さ[mm]
    'flipper_length_mm': pl.Float32, # 翼の長さ[mm]
    'body_mass_g': pl.Float32, # 体重[g]
    'sex': str
}

# ペンギンのデータセット読み込み。欠損値がNAとして含まれているので、null_values="NA"を指定しないと読み込みエラーになる。
df = pl.read_csv('../Python/sample_data/ML_sample/penguins_size.csv',dtypes=dtypes, null_values='NA')


### - 前処理

In [3]:
df.null_count()

species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
u32,u32,u32,u32,u32,u32,u32
0,0,2,2,2,2,10


In [4]:
# sexカラムの.は欠損値扱いとする。
df = df.with_columns(
    (pl.when(pl.col('sex') == '.').then(None).otherwise(pl.col('sex'))).alias('sex')
)

In [5]:
# 欠損値が多すぎる行（値が入っている列が3つより少ない行）を削除する。
# この操作はPolarsだと面倒なので、一回Pandasに変換してやる。
df = pl.from_pandas(df.to_pandas().dropna(thresh=3))

In [6]:
df.null_count()

species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,9


In [7]:
target = 'species'
X = df.drop(target)
y = df.get_column(target)

In [8]:
# sexカラムの欠損値を文字列の'NaN'で置き換える
X = X.fill_null('NaN')
X.null_count()

island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
u32,u32,u32,u32,u32,u32
0,0,0,0,0,0


今回は欠損値を新たなカテゴリ'NaN'として扱うだけなので、欠損値対応を交差検証の中で行う必要はない。  
この段階でOne-hot Encodingによるダミー変数化を行うと、greedy feature selectionによる特徴量選択がダミー変数も含む状態で行われ、  
計算量が多くなる（多分）、かつ元の特徴量を部分的に使うことになってしまいわかりづらくなってしまうので、  
ダミー変数化はpipelineにしてgreedy feature selectionのCVの中で行う。

### - 特徴量エンジニアリング

ここでは多項式特徴量と四則演算した結果を用いる。

In [9]:
# 多項式特徴量
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_length_depth = poly.fit_transform(X.select(['culmen_length_mm', 'culmen_depth_mm']))

In [10]:
X = X.with_columns([
    pl.Series(poly_length_depth[:, 0]).alias('culmen_length_mm'),
    pl.Series(poly_length_depth[:, 1]).alias('culmen_depth_mm'),
    pl.Series(poly_length_depth[:, 2]).alias('culmen_length_mm^2'),
    pl.Series(poly_length_depth[:, 3]).alias('culmen_length_X_depth'),
    pl.Series(poly_length_depth[:, 4]).alias('culmen_depth_mm^2')
])

In [11]:
# culmen_length_mm, culmen_depth_mmの差と比を計算する
X = X.with_columns([
    (pl.col('culmen_length_mm') - pl.col('culmen_depth_mm')).alias('culmen_diff'),
    (pl.col('culmen_length_mm') / pl.col('culmen_depth_mm')).alias('culmen_ratio'),
])

In [12]:
X.head(3)

island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,culmen_length_mm^2,culmen_length_X_depth,culmen_depth_mm^2,culmen_diff,culmen_ratio
str,f64,f64,f32,f32,str,f64,f64,f64,f64,f64
"""Torgersen""",39.099998,18.700001,181.0,3750.0,"""MALE""",1528.809881,731.170001,349.690029,20.399998,2.090909
"""Torgersen""",39.5,17.4,186.0,3800.0,"""FEMALE""",1560.25,687.299985,302.759987,22.1,2.270115
"""Torgersen""",40.299999,18.0,195.0,3250.0,"""FEMALE""",1624.089939,725.399986,324.0,22.299999,2.238889


### - CVの準備

In [15]:
# cvインスタンスを生成。3 fold で評価する。
cv = KFold(n_splits=3, random_state=0, shuffle=True)

In [19]:
# 一連の学習・予測用のPipelineを定義。
pipe = Pipeline(steps=[('dummies', OneHotEncoder(drop='first',handle_unknown='ignore', sparse_output=False)),
                       ('std_scale', StandardScaler()),
                       ('model', LogisticRegression())])

In [27]:
ohe = OneHotEncoder(drop='first',handle_unknown='ignore', sparse_output=False)

In [32]:
X

island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,culmen_length_mm^2,culmen_length_X_depth,culmen_depth_mm^2,culmen_diff,culmen_ratio
str,f64,f64,f32,f32,str,f64,f64,f64,f64,f64
"""Torgersen""",39.099998,18.700001,181.0,3750.0,"""MALE""",1528.809881,731.170001,349.690029,20.399998,2.090909
"""Torgersen""",39.5,17.4,186.0,3800.0,"""FEMALE""",1560.25,687.299985,302.759987,22.1,2.270115
"""Torgersen""",40.299999,18.0,195.0,3250.0,"""FEMALE""",1624.089939,725.399986,324.0,22.299999,2.238889
"""Torgersen""",36.700001,19.299999,193.0,3450.0,"""FEMALE""",1346.890056,708.309987,372.489971,17.400002,1.901555
"""Torgersen""",39.299999,20.6,190.0,3650.0,"""MALE""",1544.48994,809.579999,424.360016,18.699999,1.907767
"""Torgersen""",38.900002,17.799999,181.0,3625.0,"""FEMALE""",1513.210119,692.419997,316.839973,21.100002,2.185393
"""Torgersen""",39.200001,19.6,195.0,4675.0,"""MALE""",1536.64006,768.32003,384.160015,19.6,2.0
"""Torgersen""",34.099998,18.1,193.0,3475.0,"""NaN""",1162.809896,617.209985,327.610014,15.999998,1.883978
"""Torgersen""",42.0,20.200001,190.0,4250.0,"""NaN""",1764.0,848.400032,408.040031,21.799999,2.079208
"""Torgersen""",37.799999,17.1,186.0,3300.0,"""NaN""",1428.839942,646.380001,292.410013,20.699999,2.210526


In [31]:
X.to_dummies()

DuplicateError: unable to hstack, column with name "culmen_ratio_2.64" already exists

In [29]:
ohe.set_output(transform='pandas')
ohe.fit_transform(X)

Unnamed: 0,island_Dream,island_Torgersen,culmen_length_mm_33.099998474121094,culmen_length_mm_33.5,culmen_length_mm_34.0,culmen_length_mm_34.099998474121094,culmen_length_mm_34.400001525878906,culmen_length_mm_34.5,culmen_length_mm_34.599998474121094,culmen_length_mm_35.0,...,culmen_ratio_3.4397162189966237,culmen_ratio_3.4437499046325684,culmen_ratio_3.4444444444444446,culmen_ratio_3.445255578107285,culmen_ratio_3.453900669885171,culmen_ratio_3.4585987195373185,culmen_ratio_3.492424177291526,culmen_ratio_3.5058822631835938,culmen_ratio_3.510489517018651,culmen_ratio_3.6126760511354803
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
pipe.fit(X,y)

### - Greedy feature selectionクラスの定義

In [None]:
class GreedyFeatureSelection:
    def __init__(self,) -> None:
        