L1正則化、L2正則化ともに適応したLGBM

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score


In [21]:
# データの読み込み
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# 特徴量とターゲット変数の分割

X_test = train.drop('dengue', axis=1)  
y_test = train['dengue']

In [22]:
#class weight=NoneのLGBM
def lgb_evaluate(num_leaves, max_depth, learning_rate, n_estimators, min_child_samples, subsample, colsample_bytree,lambda_l1,lambda_l2):
    model = LGBMClassifier(
        num_leaves=int(num_leaves),
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        min_child_samples=int(min_child_samples),
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        lambda_l1=lambda_l1,
        lambda_l2=lambda_l2
        )
    
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)


In [23]:
params = {
    'num_leaves': (20, 50),
    'max_depth': (14, 20),
    'learning_rate': (0.09, 1),
    'n_estimators': (150, 250),
    'min_child_samples': (50, 100),
    'subsample': (0.1, 1),
    'colsample_bytree': (0.3, 0.5),
    'lambda_l1' :(0.1,10),
    'lambda_l2' :(0.1,10)
}

# Bayesian Optimization の実行
optimizer = BayesianOptimization(f=lgb_evaluate, pbounds=params, random_state=42)
optimizer.maximize(init_points=5, n_iter=30)

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 1527, number of negative: 1981
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 3508, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.435291 -> initscore=-0.260297
[LightGBM] [Info] Start training from score -0.260297
| [39m1        [39m | [39m0.5158   [39m | [39m0.3749   [39m | [39m9.512    [39m | [39m7.347    [39m | [39m0.6348   [39m | [39m14.94    [39m | [39m57.8     [39m | 

In [24]:
best_params = optimizer.max['params']
print(optimizer.max)

{'target': 0.5295080934685394, 'params': {'colsample_bytree': 0.3, 'lambda_l1': 3.6365140808329874, 'lambda_l2': 7.911821187385673, 'learning_rate': 1.0, 'max_depth': 17.13366387808514, 'min_child_samples': 84.95380033611424, 'n_estimators': 204.08754393337284, 'num_leaves': 44.69672720025445, 'subsample': 1.0}}


In [25]:
# パラメータを整数に変換（num_leaves, max_depth, n_estimatorsは整数が必要）
best_params['num_leaves'] = int(best_params['num_leaves'])
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['min_child_samples'] = int(best_params['min_child_samples'])


In [26]:
model = LGBMClassifier(**best_params)
model.fit(X_train, y_train)
model.booster_.save_model('optimized_lgbm2.txt')

[LightGBM] [Info] Number of positive: 1527, number of negative: 1981
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000218 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 3508, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.435291 -> initscore=-0.260297
[LightGBM] [Info] Start training from score -0.260297


<lightgbm.basic.Booster at 0x7b7790934f90>

In [27]:
# テストデータを使って予測
y_pred = model.predict(X_test)

# 精度の評価
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# 分類レポートの表示
print(classification_report(y_test, y_pred))


Accuracy: 0.5295080934685394
              precision    recall  f1-score   support

           0       0.57      0.69      0.62      8887
           1       0.45      0.32      0.38      6990

    accuracy                           0.53     15877
   macro avg       0.51      0.51      0.50     15877
weighted avg       0.52      0.53      0.51     15877



In [28]:
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.3,
 'importance_type': 'split',
 'learning_rate': 1.0,
 'max_depth': 17,
 'min_child_samples': 84,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 204,
 'n_jobs': None,
 'num_leaves': 44,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'lambda_l1': 3.6365140808329874,
 'lambda_l2': 7.911821187385673}