In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score


In [59]:
# データの読み込み
train = pd.read_csv('../../data/train.csv')
test = pd.read_csv('../../data/test.csv')

# 特徴量とターゲット変数の分割
X_train = train.drop('dengue', axis=1)  
y_train = train['dengue']

X_test = test.drop('dengue', axis=1)  
y_test = test['dengue']


In [60]:
model = LGBMClassifier(class_weight='balanced',)


In [61]:
param_dist = {
    'num_leaves': np.arange(380,400, 1),
    'learning_rate': np.logspace(-0.02, 0, 10),
    'n_estimators': np.arange(240, 280, 1),
    'max_depth': np.arange(3, 15, 1),
    'lambda_l2' :np.arange(0.00001,0.00005,10)
}

In [62]:
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,  # 試行回数
    scoring='accuracy',  # 評価指標
    cv=5,  # クロスバリデーション
    verbose=1,
    random_state=42
)


In [63]:
# X_train, y_trainは学習データとラベル
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[LightGBM] [Info] Number of positive: 2860, number of negative: 3244
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031738 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 6104, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 2860, number of negative: 3245
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000810 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 

In [64]:
print("Best Parameters: ", random_search.best_params_)

Best Parameters:  {'num_leaves': np.int64(383), 'n_estimators': np.int64(253), 'max_depth': np.int64(6), 'learning_rate': np.float64(0.9847666521101581), 'lambda_l2': np.float64(1e-05)}


In [65]:
model = LGBMClassifier(**random_search.best_params_)
model.fit(X_train, y_train)
#model.booster_.save_model('optimized_LGBM_random.txt')

[LightGBM] [Info] Number of positive: 3575, number of negative: 4056
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016882 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 7631, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.468484 -> initscore=-0.126232
[LightGBM] [Info] Start training from score -0.126232


In [66]:
# テストデータを使って予測
y_pred = model.predict(X_test)

# 精度の評価
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# 分類レポートの表示
print(classification_report(y_test, y_pred))

Accuracy: 0.5458689458689459
              precision    recall  f1-score   support

           0       0.68      0.58      0.63      1157
           1       0.37      0.48      0.42       598

    accuracy                           0.55      1755
   macro avg       0.53      0.53      0.52      1755
weighted avg       0.58      0.55      0.56      1755

