In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from scipy.stats import uniform
from scipy.stats import randint as sp_randint

import lightgbm as lgb
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
import plotly.express as px

## Grid Search

In [2]:
# Load the California housing dataset
dataset = fetch_california_housing()
# Show the description text bundled with the dataset
print(dataset.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [None]:
# Data preparation: feature frame with named columns, plus the target vector
X = pd.DataFrame(data=dataset['data'], columns=dataset['feature_names'])
y = dataset['target']

# Hold-out split: 20% of the rows are set aside as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Model and the hyperparameter grid to search (6 x 3 x 3 candidates)
lgbmr = lgb.LGBMRegressor(random_state=0)
param_grid = dict(
    num_leaves=[10, 20, 30, 40, 50, 60],
    max_depth=[5, 10, 15],
    reg_alpha=[0, 0.01, 0.03],
)

# Shuffled 3-fold CV used by the grid search
cv = KFold(n_splits=3, shuffle=True, random_state=0)
gs = GridSearchCV(estimator=lgbmr, param_grid=param_grid, cv=cv)

In [None]:
# Early stopping (stopping_rounds=10) monitored on the hold-out split.
# The extra keyword arguments are forwarded by GridSearchCV to the estimator's fit().
eval_set = [(X_val, y_val)]
callbacks = [lgb.early_stopping(stopping_rounds=10)]
fit_params = dict(callbacks=callbacks, eval_set=eval_set)
gs.fit(X_train, y_train, **fit_params)

In [None]:
# Inspect the grid-search results: one row per candidate parameter set
cv_results_df = pd.DataFrame(data=gs.cv_results_)
cv_results_df.head(n=3)

In [None]:
# Show the top-5 results, best first.
# The rows of cv_results_df are not ordered by rank, so sort explicitly before
# printing (a stable sort keeps tied candidates in their original order).
top5_df = (
    cv_results_df[cv_results_df['rank_test_score'] <= 5]
    .sort_values('rank_test_score', kind='stable')
)
for _, row in top5_df.iterrows():
    print(f'{row["rank_test_score"]}: {row["params"]}')
    print(f'{row["mean_test_score"]}')
    

In [None]:
# Retrain with the best hyperparameter combination found by the grid search,
# this time with learning_rate=0.01 and up to 1000 trees; early stopping
# (stopping_rounds=10) is monitored on the hold-out split.
eval_set = [(X_val, y_val)]
callbacks = [lgb.early_stopping(stopping_rounds=10)]
fit_params = dict(callbacks=callbacks, eval_set=eval_set)
lgbmr = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    random_state=0,
    **gs.best_params_,
)
lgbmr.fit(X_train, y_train, **fit_params)

In [None]:
# Evaluate the refit model on the hold-out split using the estimator's default score.
# NOTE(review): the same (X_val, y_val) pair is passed as the early-stopping
# eval_set when the model is fitted, so this estimate may be optimistic —
# consider scoring on a split that is not used for early stopping.
lgbmr.score(X_val, y_val)

## Pipeline + Grid Search

In [None]:
# Load the raw data
df = pd.read_csv('penguins_size.csv')
# Data cleaning: a '.' in the sex column is treated as a missing value.
# A boolean mask handles every such row and is a no-op when there is none
# (indexing the first match with .index[0] would raise IndexError in that case).
df.loc[df['sex'] == '.', 'sex'] = np.nan
# Keep only rows with at least 3 non-missing values. Rebinding instead of
# inplace=True keeps the cell safe to re-run and avoids mutating in place.
df = df.dropna(thresh=3)
# Build X (features) and y (target)
X = df.drop('species', axis=1)
y = df['species']

# Missing-value imputation: fill the non-numeric columns with their most
# frequent value and pass every other column through unchanged
cat_cols = X.select_dtypes(exclude=np.number).columns.to_list()
imputer = SimpleImputer(strategy='most_frequent')
ct = ColumnTransformer([('cat_imputer', imputer, cat_cols)], remainder='passthrough')
# Return DataFrames (not arrays) so the next pipeline step can call pd.get_dummies
ct.set_output(transform='pandas')

# OHE
class GetDummies(BaseEstimator, TransformerMixin):
    """One-hot encoding transformer built on ``pd.get_dummies``.

    ``fit`` remembers the dummy column labels produced from the fitting data;
    ``transform`` encodes new data and aligns it to those labels, so the
    output always has the same columns in the same order.
    """

    def __init__(self):
        # Dummy column labels recorded by fit(); None until fit() has run
        self.columns = None

    def fit(self, X, y=None):
        """Record the columns of ``pd.get_dummies(X)``; ``y`` is ignored.

        Returns:
            self, so calls can be chained.
        """
        fitted_dummies = pd.get_dummies(X)
        self.columns = fitted_dummies.columns
        return self

    def transform(self, X):
        """Encode ``X`` and reindex it to the columns recorded by ``fit``.

        Columns recorded at fit time but absent from the encoded ``X`` are
        added and filled with 0; columns not recorded at fit time are dropped.
        """
        encoded = pd.get_dummies(X)
        aligned = encoded.reindex(columns=self.columns, fill_value=0)
        return aligned

# Model
lgbmc = lgb.LGBMClassifier(random_state=0)

# Pipeline: imputation -> one-hot encoding -> classifier
pipeline = Pipeline(steps=[
    ('impute', ct),
    ('dummy', GetDummies()),
    ('model', lgbmc),
])

# Shuffled 3-fold cross-validation
cv = KFold(n_splits=3, shuffle=True, random_state=0)

# Grid search.
# When searching over a pipeline, each key must be prefixed with the name of
# the pipeline step it belongs to, followed by a double underscore.
param_grid = {
    'model__num_leaves': [10, 20, 30, 40, 50, 60],
    'model__max_depth': [5, 10, 15],
    'model__reg_alpha': [0, 0.01, 0.03],
}
gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv)
gs.fit(X, y)

In [None]:
# Collect the results of the pipeline grid search into a DataFrame for inspection
cv_results_df = pd.DataFrame(gs.cv_results_)

## Random Search CV

In [None]:
# Data preparation
dataset = fetch_california_housing()
X = pd.DataFrame(dataset['data'], columns=dataset['feature_names'])
y = dataset['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Model and the distributions to sample hyperparameters from.
# scipy's randint excludes its upper bound, so it is set one past the intended
# maximum: num_leaves is drawn from 10..60 and max_depth from 5..15, matching
# the ranges used by the other searches in this notebook.
# uniform(loc, scale) samples from [loc, loc + scale], i.e. [0, 0.03] here.
lgbmr = lgb.LGBMRegressor(random_state=0)
param_dist = {'num_leaves': sp_randint(10, 61),
              'max_depth': sp_randint(5, 16),
              'reg_alpha': uniform(0, 0.03)}

# Early stopping (stopping_rounds=10) monitored on the hold-out split
eval_set = [(X_val, y_val)]
callbacks = [lgb.early_stopping(stopping_rounds=10)]
fit_params = {'callbacks': callbacks, 'eval_set': eval_set}

# Shuffled 3-fold cross-validation
cv = KFold(n_splits=3, shuffle=True, random_state=0)

# Random search over 36 sampled candidates. random_state seeds the parameter
# sampling so that re-running the cell evaluates the same candidates.
rs = RandomizedSearchCV(lgbmr, param_distributions=param_dist, cv=cv, n_iter=36,
                        random_state=0)
rs.fit(X_train, y_train, **fit_params)

In [None]:
# Inspect the random-search results: one row per sampled parameter set
cv_results_df = pd.DataFrame(data=rs.cv_results_)
cv_results_df.head(n=3)

In [None]:
# Show the top-5 results, best first.
# The rows of cv_results_df are not ordered by rank, so sort explicitly before
# printing (a stable sort keeps tied candidates in their original order).
top5_df = (
    cv_results_df[cv_results_df['rank_test_score'] <= 5]
    .sort_values('rank_test_score', kind='stable')
)
for _, row in top5_df.iterrows():
    print(f"{row['rank_test_score']}: {row['params']}")
    print(f"{row['mean_test_score']}")

In [None]:
# Recorded result from an earlier run (rank: params, mean test score):
# 1: {'max_depth': 15, 'num_leaves': 60, 'reg_alpha': 0.01} 0.8338051795879502
# NOTE(review): these values look like grid points (reg_alpha is exactly 0.01),
# so this line was presumably copied from the grid search for comparison — confirm.
# Best parameter set found by the random search:
rs.best_params_

## ベイズ最適化

In [None]:
# Data preparation: the full dataset is used here (no hold-out split),
# because the objective below scores candidates with cross-validation
dataset = fetch_california_housing()
X = pd.DataFrame(data=dataset['data'], columns=dataset['feature_names'])
y = dataset['target']
# Objective function
def objective(params):
    """Objective for hyperopt: negated mean 3-fold CV score of an LGBMRegressor.

    Reads the module-level ``X`` and ``y`` and appends one entry per call to
    the module-level ``log`` dict (keys 'params', 'score', 'score_std').

    Args:
        params: mapping with 'num_leaves', 'max_depth' and 'reg_alpha'.

    Returns:
        The negative of the mean cross-validation score, so that minimizing
        this value maximizes the score.
    """
    # Every sampled value arrives as a float, so cast the integer-valued ones
    cast_params = {
        'num_leaves': int(params['num_leaves']),
        'max_depth': int(params['max_depth']),
        'reg_alpha': params['reg_alpha'],
    }

    # Score the candidate with shuffled 3-fold CV (default scoring of cross_val_score)
    splitter = KFold(n_splits=3, shuffle=True, random_state=0)
    regressor = lgb.LGBMRegressor(**cast_params, random_state=0)
    fold_scores = cross_val_score(regressor, X, y, cv=splitter)
    mean_score = fold_scores.mean()

    # Record the trial in the global log so it can be inspected afterwards
    trial_record = (
        ('params', cast_params),
        ('score', mean_score),
        ('score_std', fold_scores.std()),
    )
    for key, value in trial_record:
        log[key].append(value)

    # The optimizer minimizes, so negate the score: lower means better
    return -mean_score

# Search space: num_leaves and max_depth are sampled on a step of 2
# (quniform), reg_alpha uniformly between 0 and 0.03
space = {'num_leaves': hp.quniform('num_leaves', 10, 60, 2),
         'max_depth': hp.quniform('max_depth', 5, 15, 2),
         'reg_alpha': hp.uniform('reg_alpha', 0, 0.03)}

# Per-trial history filled in by objective(); reset on every run of this cell
log = {'params': [], 'score': [], 'score_std': []}
# Run 100 TPE trials. rstate seeds the sampler so the run is reproducible.
# NOTE: a numpy Generator is what hyperopt >= 0.2.7 expects; older releases
# take np.random.RandomState(0) instead.
best = fmin(objective, space=space, algo=tpe.suggest, max_evals=100,
            rstate=np.random.default_rng(0))

In [None]:
# Inspect the log: one row per trial, best score first
log_df = pd.DataFrame(log).sort_values('score', ascending=False)
# Expand the params dicts into one column per hyperparameter. Building a single
# DataFrame from the list of dicts avoids a row-wise apply, keeps num_leaves and
# max_depth as integers, and selects the columns by name rather than by position.
param_cols = ['num_leaves', 'max_depth', 'reg_alpha']
params_df = pd.DataFrame(log_df['params'].tolist(), index=log_df.index)
log_df[param_cols] = params_df[param_cols]
log_df.head(3)

In [None]:
# Plot the hyperparameter combinations tried during the Bayesian optimization
# (one point per trial) in 3D.
# Points are colored by score, which makes it possible to check whether the
# sampled combinations cluster around the high-scoring region.
fig = px.scatter_3d(
    data_frame=log_df,
    x='num_leaves',
    y='max_depth',
    z='reg_alpha',
    color='score',
)
fig.show()