L2正則化を適応したLGBM

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score


In [11]:
# データの読み込み
train = pd.read_csv('../../data/train.csv')
test = pd.read_csv('../../data/test.csv')

# 特徴量とターゲット変数の分割
X_train = train.drop('dengue', axis=1)  
y_train = train['dengue']

X_test = test.drop('dengue', axis=1)  
y_test = test['dengue']


In [12]:
#class weight=NoneのLGBM
def lgb_evaluate(num_leaves, max_depth, learning_rate, n_estimators, min_child_samples, subsample, colsample_bytree,lambda_l2):
    model = LGBMClassifier(
        num_leaves=int(num_leaves),
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        min_child_samples=int(min_child_samples),
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        lambda_l2=lambda_l2,
        class_weight='balanced'
        )
    
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)


In [13]:
params = {
    'num_leaves': (1, 50),
    'max_depth': (1, 100),
    'learning_rate': (0.0001, 1),
    'n_estimators': (1, 1000),
    'min_child_samples': (1, 50),
    'subsample': (0.1, 1),
    'colsample_bytree': (0.1, 1),
    'lambda_l2' :(0.00001,10)
}

# Bayesian Optimization の実行
optimizer = BayesianOptimization(f=lgb_evaluate, pbounds=params, random_state=42)
optimizer.maximize(init_points=5, n_iter=30)

|   iter    |  target   | colsam... | lambda_l2 | learni... | max_depth | min_ch... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 3575, number of negative: 4056
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012742 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 7631, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
| [39m1        [39m | [39m0.5852   [39m | [39m0.4371   [39m | [39m9.507    [39m | [39m0.732    [39m | [39m60.27    [39m | [39m8.645    [39m | [39m156.8    [39m | [39m3.846    [39m | [39m

In [14]:
best_params = optimizer.max['params']
print(optimizer.max)

{'target': np.float64(0.6170940170940171), 'params': {'colsample_bytree': np.float64(0.37185975737360477), 'lambda_l2': np.float64(7.536312808453843), 'learning_rate': np.float64(0.7056219507744071), 'max_depth': np.float64(59.88628747823721), 'min_child_samples': np.float64(7.442553713003516), 'n_estimators': np.float64(157.0471459195297), 'num_leaves': np.float64(3.673459765214692), 'subsample': np.float64(0.7897094358038023)}}


In [15]:
# パラメータを整数に変換（num_leaves, max_depth, n_estimatorsは整数が必要）
best_params['num_leaves'] = int(best_params['num_leaves'])
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['min_child_samples'] = int(best_params['min_child_samples'])


In [16]:
model = LGBMClassifier(**best_params)
model.fit(X_train, y_train)
model.booster_.save_model('optimized_lgbm_byse.txt')

[LightGBM] [Info] Number of positive: 3575, number of negative: 4056
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 7631, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.468484 -> initscore=-0.126232
[LightGBM] [Info] Start training from score -0.126232


<lightgbm.basic.Booster at 0x7fa385db6510>

In [17]:
# テストデータを使って予測
y_pred = model.predict(X_test)

# 精度の評価
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# 分類レポートの表示
print(classification_report(y_test, y_pred))


Accuracy: 0.6193732193732193
              precision    recall  f1-score   support

           0       0.66      0.89      0.76      1157
           1       0.31      0.09      0.14       598

    accuracy                           0.62      1755
   macro avg       0.48      0.49      0.45      1755
weighted avg       0.54      0.62      0.55      1755



In [18]:
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': np.float64(0.37185975737360477),
 'importance_type': 'split',
 'learning_rate': np.float64(0.7056219507744071),
 'max_depth': 59,
 'min_child_samples': 7,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 157,
 'n_jobs': None,
 'num_leaves': 3,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': np.float64(0.7897094358038023),
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'lambda_l2': np.float64(7.536312808453843)}