In [18]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
import lightgbm as lgb

In [21]:
# Load the California housing data: features as a DataFrame, target as returned by the loader.
dataset = fetch_california_housing()
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = dataset.target

# Hold out 20% of the rows; the seed is fixed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [19]:
# Base model
lgbmr = lgb.LGBMRegressor(random_state=0)

# Hyperparameters to grid-search (6 x 3 x 3 = 54 combinations)
param_grid = dict(
    num_leaves=[10, 20, 30, 40, 50, 60],
    max_depth=[5, 10, 15],
    reg_alpha=[0, 0.01, 0.03],
)

# 3-fold cross-validation, shuffled with a fixed seed
cv = KFold(n_splits=3, shuffle=True, random_state=0)
gs = GridSearchCV(estimator=lgbmr, param_grid=param_grid, cv=cv)


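もし、max_depth: 5 のときは num_leaves: 10-30、max_depth: 10, 15 のときは num_leaves: 10-60、というように条件で分けたいなら、  
二つのparam_gridを作成して回すことになる（GridSearchCVのparam_gridには辞書のリストも渡せるので、[param_grid_1, param_grid_2] とすれば一度に回せる）。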

param_grid_1 = {'num_leaves': [10, 20, 30],  
              'max_depth': [5],   
              'reg_alpha': [0, 0.01, 0.03]}  
              
param_grid_2 = {'num_leaves': [10, 20, 30, 40, 50, 60],  
              'max_depth': [10, 15],   
              'reg_alpha': [0, 0.01, 0.03]}  

In [23]:
# Early-stopping settings that are forwarded to every fit() call:
# training stops once the validation metric has not improved for 10 rounds.
eval_set = [(X_val, y_val)]
callbacks = [lgb.early_stopping(stopping_rounds=10)]
fit_params = dict(callbacks=callbacks, eval_set=eval_set)

# NOTE(review): X_val drives early stopping here and is also the set the final model is
# scored on later in the notebook, so that final score is likely optimistic.
gs.fit(X_train, y_train, **fit_params)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 11008, number of used features: 8
[LightGBM] [Info] Start training from score 2.064558
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.264841
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000205 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 11008, number of used features: 8
[LightGBM] [Info] Start training from score 2.075166
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	v

In [27]:
# Tabulate the grid-search results and preview the first three rows.
cv_result_df = pd.DataFrame(data=gs.cv_results_)
cv_result_df.iloc[:3]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_num_leaves,param_reg_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.132312,0.070383,0.011082,0.001365,5,10,0.0,"{'max_depth': 5, 'num_leaves': 10, 'reg_alpha'...",0.797096,0.807785,0.805461,0.803448,0.00459,54
1,0.108334,0.045276,0.01037,0.000627,5,10,0.01,"{'max_depth': 5, 'num_leaves': 10, 'reg_alpha'...",0.798894,0.806981,0.80605,0.803975,0.003613,53
2,0.079952,0.007273,0.010238,0.000201,5,10,0.03,"{'max_depth': 5, 'num_leaves': 10, 'reg_alpha'...",0.799277,0.806364,0.806658,0.8041,0.003413,52


In [32]:
# Print every parameter set ranked 5th or better together with its mean CV score.
# Rows are visited in their original (grid) order, not in rank order.
is_top_ranked = cv_result_df['rank_test_score'].le(5)
for _, result in cv_result_df.loc[is_top_ranked].iterrows():
    print(f'{result["rank_test_score"]}: {result["params"]}')
    print(f'{result["mean_test_score"]}')

4: {'max_depth': 10, 'num_leaves': 50, 'reg_alpha': 0.01}
0.8329725903236045
3: {'max_depth': 15, 'num_leaves': 50, 'reg_alpha': 0}
0.8331058990226472
2: {'max_depth': 15, 'num_leaves': 50, 'reg_alpha': 0.03}
0.8332059773155042
5: {'max_depth': 15, 'num_leaves': 60, 'reg_alpha': 0}
0.8327661059213919
1: {'max_depth': 15, 'num_leaves': 60, 'reg_alpha': 0.01}
0.8338051795879502


In [35]:
# Refit with the best grid-search parameters, using a smaller learning rate and more trees.
lgbmr = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    random_state=0,
    **gs.best_params_,
)
lgbmr.fit(X_train, y_train, **fit_params)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000760 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.072499
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.195693


In [36]:
# Score the refitted model on the hold-out split (presumably R^2, scikit-learn's default
# regressor score — confirm).
# NOTE(review): X_val is also the early-stopping eval_set passed through fit_params, so this
# figure is likely optimistic; confirm on data the model has never seen.
lgbmr.score(X_val, y_val)

0.8499232425233268

## Pipeline + Grid Search

In [111]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
warnings.simplefilter('ignore')

In [58]:
df = pd.read_csv('document/penguins_size.csv')

# Data cleaning: treat the literal '.' in the sex column as missing,
# then keep only the rows that have at least three non-missing values.
df['sex'] = df['sex'].mask(df['sex'].eq('.'), np.nan)
df = df.dropna(thresh=3)

# Data preparation: species is the target, every other column is a feature.
X = df.drop(columns='species')
y = df['species']

In [120]:
# Pipeline + grid search
# ---- preprocessing needed ----
#   * impute missing values
#   * turn categorical variables into dummy variables

# Impute missing values with the most frequent value, for the non-numeric columns only
cat_cols = make_column_selector(dtype_exclude=np.number)
imputer = SimpleImputer(strategy='most_frequent')

# remainder defaults to 'drop', which discards every column that is not transformed,
# so remember to switch it to 'passthrough'.
ct = ColumnTransformer(
    transformers=[('cat_imputer', imputer, cat_cols)],
    remainder='passthrough',
).set_output(transform='pandas')

# one-hot encoding
class GetDummies(BaseEstimator, TransformerMixin):
    """One-hot encode a DataFrame with ``pd.get_dummies``, keeping the column layout seen in ``fit``.

    ``pd.get_dummies`` only creates columns for the categories present in the
    frame it is given, so different CV folds could otherwise produce different
    columns. ``fit`` records the dummy columns of the training data and
    ``transform`` reindexes its output to exactly those columns: columns that
    are missing are added and filled with 0, columns not recorded in ``fit``
    are dropped.
    """

    def __init__(self):
        # Dummy-column index recorded by fit(); None until fit() has been called.
        self.columns = None

    def fit(self, X, y=None):
        """Record the dummy-column layout of ``X`` (``y`` is ignored) and return ``self``."""
        self.columns = pd.get_dummies(X).columns
        return self

    def transform(self, X):
        """Return ``pd.get_dummies(X)`` aligned to the columns recorded by ``fit``.

        Raises:
            NotFittedError: if called before ``fit``.
        """
        if self.columns is None:
            # Without this guard, reindex(columns=None) would hand back the un-aligned
            # dummies without any error. Local import: only needed on this error path.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This GetDummies instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        X_new = pd.get_dummies(X)
        return X_new.reindex(columns=self.columns, fill_value=0)
    
    
# Model
lgbmc = lgb.LGBMClassifier(random_state=0, verbose=-1)


# Pipeline: impute -> one-hot encode -> classify
pipeline = Pipeline(steps=[
    ('impute', ct),
    ('dummy', GetDummies()),
    ('model', lgbmc),
])


# Cross-validation: 3 shuffled folds with a fixed seed
cv = KFold(n_splits=3, shuffle=True, random_state=0)


# Grid search
# The parameters belong to a pipeline step, so every key needs the '<step name>__' prefix.
param_grid = {
    'model__num_leaves': [10, 20, 30, 40, 50, 60],
    'model__max_depth': [5, 10, 15],
    'model__reg_alpha': [0, 0.01, 0.03],
}
gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv)


In [123]:
# Run the grid search over the whole pipeline (preprocessing + model) on the penguins data.
gs.fit(X, y)

In [131]:
# Grid-search results ordered by rank (best first)
(
    pd.DataFrame(gs.cv_results_)
    .sort_values('rank_test_score')
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__num_leaves,param_model__reg_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.109075,0.072923,0.013489,0.001035,5,10,0.0,"{'model__max_depth': 5, 'model__num_leaves': 1...",0.991228,0.982456,0.991228,0.988304,0.004135,1
33,0.060038,0.001558,0.01277,0.000355,10,60,0.0,"{'model__max_depth': 10, 'model__num_leaves': ...",0.991228,0.982456,0.991228,0.988304,0.004135,1
27,0.058181,0.002305,0.012484,0.000334,10,40,0.0,"{'model__max_depth': 10, 'model__num_leaves': ...",0.991228,0.982456,0.991228,0.988304,0.004135,1
24,0.058732,0.003683,0.013267,0.000367,10,30,0.0,"{'model__max_depth': 10, 'model__num_leaves': ...",0.991228,0.982456,0.991228,0.988304,0.004135,1
36,0.053567,0.000852,0.012693,0.000822,15,10,0.0,"{'model__max_depth': 15, 'model__num_leaves': ...",0.991228,0.982456,0.991228,0.988304,0.004135,1
21,0.069846,0.006439,0.013699,0.000775,10,20,0.0,"{'model__max_depth': 10, 'model__num_leaves': ...",0.991228,0.982456,0.991228,0.988304,0.004135,1
39,0.054289,0.001411,0.011731,0.000896,15,20,0.0,"{'model__max_depth': 15, 'model__num_leaves': ...",0.991228,0.982456,0.991228,0.988304,0.004135,1
18,0.057165,0.008436,0.013505,0.001115,10,10,0.0,"{'model__max_depth': 10, 'model__num_leaves': ...",0.991228,0.982456,0.991228,0.988304,0.004135,1
42,0.05535,0.001836,0.012257,0.001164,15,30,0.0,"{'model__max_depth': 15, 'model__num_leaves': ...",0.991228,0.982456,0.991228,0.988304,0.004135,1
12,0.053711,0.001521,0.012665,0.000691,5,50,0.0,"{'model__max_depth': 5, 'model__num_leaves': 5...",0.991228,0.982456,0.991228,0.988304,0.004135,1


In [153]:
def test(b,a,c):
  print('a : ', a)
  print('b : ', b)
  print('c : ', c)

In [154]:
# Unpacking a dict with ** binds each entry to the parameter of the same name,
# so the order of the keys does not have to match the signature.
arg = dict(a='123', b='456', c='789')

test(**arg)

a :  123
b :  456
c :  789
