L2正則化を適応したLGBM

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score


In [2]:
# データの読み込み
data = pd.read_csv('../data/preprocessed_data.csv')

# 特徴量とターゲット変数の分割
X = data.drop('dengue', axis=1)  # 'target'はターゲット列の名前です
y = data['dengue']


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
#class weight=NoneのLGBM
def lgb_evaluate(num_leaves, max_depth, learning_rate, n_estimators, min_child_samples, subsample, colsample_bytree,lambda_l2):
    model = LGBMClassifier(
        num_leaves=int(num_leaves),
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        min_child_samples=int(min_child_samples),
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        lambda_l2=lambda_l2
        )
    
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)


In [5]:
params = {
    'num_leaves': (20, 50),
    'max_depth': (14, 20),
    'learning_rate': (0.09, 1),
    'n_estimators': (150, 250),
    'min_child_samples': (50, 100),
    'subsample': (0.1, 1),
    'colsample_bytree': (0.3, 0.5),
    'lambda_l2' :(0.1,10)
}

# Bayesian Optimization の実行
optimizer = BayesianOptimization(f=lgb_evaluate, pbounds=params, random_state=42)
optimizer.maximize(init_points=5, n_iter=30)

|   iter    |  target   | colsam... | lambda_l2 | learni... | max_depth | min_ch... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 1527, number of negative: 1981
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024343 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 3508, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.435291 -> initscore=-0.260297
[LightGBM] [Info] Start training from score -0.260297
| [39m1        [39m | [39m0.6344   [39m | [39m0.3749   [39m | [39m9.512    [39m | [39m0.7561   [39m | [39m17.59    [39m | [39m57.8     [39m | [39m165.6    [39m | [39m21.74    [39m | [3

In [6]:
best_params = optimizer.max['params']
print(optimizer.max)

{'target': np.float64(0.6548974943052391), 'params': {'colsample_bytree': np.float64(0.3113956167067937), 'lambda_l2': np.float64(0.7059815679835948), 'learning_rate': np.float64(0.7656539769716649), 'max_depth': np.float64(14.809664722541571), 'min_child_samples': np.float64(91.08625783887638), 'n_estimators': np.float64(199.57847941543835), 'num_leaves': np.float64(44.46298377534453), 'subsample': np.float64(0.2188705076885123)}}


In [7]:
# パラメータを整数に変換（num_leaves, max_depth, n_estimatorsは整数が必要）
best_params['num_leaves'] = int(best_params['num_leaves'])
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['min_child_samples'] = int(best_params['min_child_samples'])


In [8]:
model = LGBMClassifier(**best_params)
model.fit(X_train, y_train)
model.booster_.save_model('optimized_lgbm2.txt')

[LightGBM] [Info] Number of positive: 1527, number of negative: 1981
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 3508, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.435291 -> initscore=-0.260297
[LightGBM] [Info] Start training from score -0.260297




<lightgbm.basic.Booster at 0x7fd360bd54c0>

In [9]:
# テストデータを使って予測
y_pred = model.predict(X_test)

# 精度の評価
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# 分類レポートの表示
print(classification_report(y_test, y_pred))


Accuracy: 0.6548974943052391
              precision    recall  f1-score   support

           0       0.69      0.70      0.69       491
           1       0.61      0.60      0.61       387

    accuracy                           0.65       878
   macro avg       0.65      0.65      0.65       878
weighted avg       0.65      0.65      0.65       878



In [10]:
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': np.float64(0.3113956167067937),
 'importance_type': 'split',
 'learning_rate': np.float64(0.7656539769716649),
 'max_depth': 14,
 'min_child_samples': 91,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 199,
 'n_jobs': None,
 'num_leaves': 44,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': np.float64(0.2188705076885123),
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'lambda_l2': np.float64(0.7059815679835948)}