L1正則化を適応したLGBM

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score


In [22]:
# データの読み込み
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# 特徴量とターゲット変数の分割

X_test = train.drop('dengue', axis=1)  
y_test = train['dengue']

In [23]:
#class weight=NoneのLGBM
def lgb_evaluate(num_leaves, max_depth, learning_rate, n_estimators, min_child_samples, subsample, colsample_bytree,lambda_l1):
    model = LGBMClassifier(
        num_leaves=int(num_leaves),
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        min_child_samples=int(min_child_samples),
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        lambda_l1=lambda_l1
        )
    
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)


In [24]:
params = {
    'num_leaves': (20, 50),
    'max_depth': (14, 20),
    'learning_rate': (0.09, 1),
    'n_estimators': (150, 250),
    'min_child_samples': (50, 100),
    'subsample': (0.1, 1),
    'colsample_bytree': (0.3, 0.5),
    'lambda_l1' :(0.1,10)
}

# Bayesian Optimization の実行
optimizer = BayesianOptimization(f=lgb_evaluate, pbounds=params, random_state=42)
optimizer.maximize(init_points=5, n_iter=30)

|   iter    |  target   | colsam... | lambda_l1 | learni... | max_depth | min_ch... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 1527, number of negative: 1981
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000305 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 3508, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.435291 -> initscore=-0.260297
[LightGBM] [Info] Start training from score -0.260297
| [39m1        [39m | [39m0.5192   [39m | [39m0.3749   [39m | [39m9.512    [39m | [39m0.7561   [39m | [39m17.59    [39m | [39m57.8     [39m | [39m165.6    [39m | [39m21.74    [39m | [3

In [25]:
best_params = optimizer.max['params']
print(optimizer.max)

{'target': 0.5288782515588587, 'params': {'colsample_bytree': 0.3130103185970559, 'lambda_l1': 9.493966818808, 'learning_rate': 0.968725150097849, 'max_depth': 18.850384088698767, 'min_child_samples': 65.23068845866854, 'n_estimators': 159.7672114006384, 'num_leaves': 40.52699079536471, 'subsample': 0.4961372443656412}}


In [26]:
# パラメータを整数に変換（num_leaves, max_depth, n_estimatorsは整数が必要）
best_params['num_leaves'] = int(best_params['num_leaves'])
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['min_child_samples'] = int(best_params['min_child_samples'])


In [27]:
model = LGBMClassifier(**best_params)
model.fit(X_train, y_train)
model.booster_.save_model('optimized_lgbm1.txt')

[LightGBM] [Info] Number of positive: 1527, number of negative: 1981
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 3508, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.435291 -> initscore=-0.260297
[LightGBM] [Info] Start training from score -0.260297


<lightgbm.basic.Booster at 0x76ff42185f10>

In [28]:
# テストデータを使って予測
y_pred = model.predict(X_test)

# 精度の評価
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# 分類レポートの表示
print(classification_report(y_test, y_pred))


Accuracy: 0.5288782515588587
              precision    recall  f1-score   support

           0       0.57      0.69      0.62      8887
           1       0.45      0.33      0.38      6990

    accuracy                           0.53     15877
   macro avg       0.51      0.51      0.50     15877
weighted avg       0.52      0.53      0.51     15877



In [29]:
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.3130103185970559,
 'importance_type': 'split',
 'learning_rate': 0.968725150097849,
 'max_depth': 18,
 'min_child_samples': 65,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 159,
 'n_jobs': None,
 'num_leaves': 40,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 0.4961372443656412,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'lambda_l1': 9.493966818808}