In [1]:
import optuna
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMClassifier
from optuna.samplers import TPESampler

# --- Configuration -----------------------------------------------------------
SEED = 100      # shared seed for the train/test split, the model and the sampler
NCALLS = 1000   # NOTE(review): not referenced by the cells visible here — presumably the intended trial budget; confirm

# Disabled search space for an SVM model; nothing visible in this notebook uses it.
# 'svm': {
#     'kernel': trial.suggest_categorical('svm__kernel', ['rbf', 'sigmoid']),
#     'gamma': trial.suggest_float('svm__gamma', 1e-5, 1),
#     'C': trial.suggest_float('svm__C', 1, 1e2),
#     'epsilon': trial.suggest_float('svm__epsilon', 1e-1, 1e1),
#     'max_iter': 20000,
# },

# --- Data --------------------------------------------------------------------
# Read the training table, then separate the label column from the features.
train_df = pd.read_parquet("../data/sentence_train.pq")
y = train_df['target'].to_numpy().astype(np.int8)
X = train_df.drop(columns=['target']).to_numpy()

# Seeded split, stratified on the label column so both parts keep the
# class proportions of the full table.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=train_df['target'],
    random_state=SEED,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def objective(trial):
    """Optuna objective: fit a LightGBM classifier and return its summed log loss.

    Samples one hyper-parameter configuration from ``trial``, fits an
    ``LGBMClassifier`` on the module-level ``X_train``/``y_train`` and scores
    the predicted probabilities (column 1 of ``predict_proba``) against the
    module-level ``X_test``/``y_test``.

    Args:
        trial: Optuna trial used to sample the ``lgbm__*`` hyper-parameters.

    Returns:
        The summed (not averaged) binary cross-entropy on the test split.
        Lower is better. The value is always finite because probabilities are
        clipped away from 0 and 1 before taking logarithms.
    """

    def suggest_params():
        """Sample the keyword arguments passed to ``LGBMClassifier``."""
        return {
            'n_estimators': trial.suggest_int('lgbm__n_estimators', 50, 100),
            'max_depth': trial.suggest_int('lgbm__max_depth', 4, 50),
            'min_child_weight': trial.suggest_int('lgbm__min_child_weight', 1, 6),
            'learning_rate': trial.suggest_float('lgbm__learning_rate', 1e-5, 1),
            'reg_alpha': trial.suggest_float('lgbm__reg_alpha', 0, 1e1),
            'reg_lambda': trial.suggest_float('lgbm__reg_lambda', 0, 1e1),
            'verbosity': -1,
            'random_state': SEED
        }

    def loss(y, y_pred):
        """Summed binary cross-entropy of ``y_pred`` against labels ``y``.

        Assumes ``y`` holds 0/1 labels. Probabilities are clipped to
        ``[eps, 1 - eps]`` first: a prediction of exactly 0.0 or 1.0 would
        otherwise produce ``log(0) = -inf`` (or ``0 * -inf = nan``) and hand
        Optuna a non-finite trial value.
        """
        eps = np.finfo(np.float64).eps
        y_pred = np.clip(np.asarray(y_pred, dtype=np.float64), eps, 1 - eps)
        return -np.sum((y * np.log(y_pred)) + (1 - y) * np.log(1 - y_pred))

    params = suggest_params()

    lgbm = LGBMClassifier(
        **params
    )

    lgbm.fit(X_train, y_train)
    # Column 1 of predict_proba is used as the probability of label 1.
    y_pred = lgbm.predict_proba(X_test)[:, 1]

    # NOTE(review): the score is computed on X_test/y_test, so that split is
    # effectively a validation set for the hyper-parameter search.
    return loss(y_test, y_pred)

In [3]:
# Seeded TPE sampler so the sequence of suggested hyper-parameters is repeatable.
sampler = TPESampler(seed=SEED)

# In-memory study; the objective returns a loss, so the direction is stated
# explicitly as "minimize" (this is also Optuna's default).
study = optuna.create_study(
    direction="minimize",
    sampler=sampler,
)

# Short run: evaluate two trials of the objective.
study.optimize(
    objective,
    n_trials=2,
)

[I 2024-08-19 11:07:05,645] A new study created in memory with name: no-name-e59de698-9dad-4a15-9a04-589c2f6af616
[I 2024-08-19 11:07:07,078] Trial 0 finished with value: 1545.3190674583593 and parameters: {'lgbm__n_estimators': 77, 'lgbm__max_depth': 17, 'lgbm__min_child_weight': 3, 'lgbm__learning_rate': 0.8447776845585805, 'lgbm__reg_alpha': 0.047188561909725646, 'lgbm__reg_lambda': 1.2156912078311422}. Best is trial 0 with value: 1545.3190674583593.
[I 2024-08-19 11:07:07,941] Trial 1 finished with value: 1061.8282633815088 and parameters: {'lgbm__n_estimators': 84, 'lgbm__max_depth': 42, 'lgbm__min_child_weight': 1, 'lgbm__learning_rate': 0.5750975784939557, 'lgbm__reg_alpha': 8.91321954312264, 'lgbm__reg_lambda': 2.092021221171896}. Best is trial 1 with value: 1061.8282633815088.
