L2正則化を適応したLGBM

In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score


In [73]:
# データの読み込み
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# 特徴量とターゲット変数の分割

X_test = train.drop('dengue', axis=1)  
y_test = train['dengue']


In [74]:
#class weight=NoneのLGBM
def lgb_evaluate(num_leaves, max_depth, learning_rate, n_estimators, min_child_samples, subsample, colsample_bytree,lambda_l2):
    model = LGBMClassifier(
        num_leaves=int(num_leaves),
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        min_child_samples=int(min_child_samples),
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        lambda_l2=lambda_l2
        )
    
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)


In [75]:
params = {
    'num_leaves': (1, 1000),
    'max_depth': (1, 50),
    'learning_rate': (0.0001, 1),
    'n_estimators': (1, 1000),
    'min_child_samples': (1, 1000),
    'subsample': (0.1, 1),
    'colsample_bytree': (0.3, 0.5),
    'lambda_l2' :(0.00001,10)
}

# Bayesian Optimization の実行
optimizer = BayesianOptimization(f=lgb_evaluate, pbounds=params, random_state=42)
optimizer.maximize(init_points=5, n_iter=30)

|   iter    |  target   | colsam... | lambda_l2 | learni... | max_depth | min_ch... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 2600, number of negative: 2970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 5570, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466786 -> initscore=-0.133051
[LightGBM] [Info] Start training from score -0.133051
| [39m1        [39m | [39m0.5356   [39m | [39m0.3749   [39m | [39m9.507    [39m | [39m0.732    [39m | [39m30.33    [39m | [39m156.9    [39m | [39m156.8    [39m | [39m59.03    [39m | [3

In [76]:
best_params = optimizer.max['params']
print(optimizer.max)

{'target': 0.5435596392717372, 'params': {'colsample_bytree': 0.34588581763892695, 'lambda_l2': 2.5033112962045045, 'learning_rate': 0.1543736105634415, 'max_depth': 3.1191247097499804, 'min_child_samples': 809.9144006790766, 'n_estimators': 6.637893381728849, 'num_leaves': 464.31608208038745, 'subsample': 0.18277046546111592}}


In [77]:
# パラメータを整数に変換（num_leaves, max_depth, n_estimatorsは整数が必要）
best_params['num_leaves'] = int(best_params['num_leaves'])
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['min_child_samples'] = int(best_params['min_child_samples'])


In [78]:
model = LGBMClassifier(**best_params)
model.fit(X_train, y_train)
model.booster_.save_model('optimized_lgbm3.txt')

[LightGBM] [Info] Number of positive: 2600, number of negative: 2970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000060 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 5570, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466786 -> initscore=-0.133051
[LightGBM] [Info] Start training from score -0.133051


<lightgbm.basic.Booster at 0x7a228d0be590>

In [79]:
# テストデータを使って予測
y_pred = model.predict(X_test)

# 精度の評価
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# 分類レポートの表示
print(classification_report(y_test, y_pred))


Accuracy: 0.5435596392717372
              precision    recall  f1-score   support

           0       0.54      1.00      0.70      6389
           1       0.00      0.00      0.00      5365

    accuracy                           0.54     11754
   macro avg       0.27      0.50      0.35     11754
weighted avg       0.30      0.54      0.38     11754



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [80]:
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.34588581763892695,
 'importance_type': 'split',
 'learning_rate': 0.1543736105634415,
 'max_depth': 3,
 'min_child_samples': 809,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 6,
 'n_jobs': None,
 'num_leaves': 464,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 0.18277046546111592,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'lambda_l2': 2.5033112962045045}