<a href="https://colab.research.google.com/github/KevinCY-Kim/dacon_ML_Smart-shipping-and-logistics/blob/main/%EB%B9%84%EC%A0%95%EC%83%81%EC%9E%91%EB%8F%99_%EB%B6%84%EB%A5%98_with_Optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
import optuna

In [3]:
# =======================
# 1. 데이터 로드
# =======================
train = pd.read_csv("train.csv", sep=",")
test = pd.read_csv("test.csv", sep=",")

X = train.drop(columns=["ID", "target"])
y = train["target"]

X_test = test.drop(columns=["ID"])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [4]:
from sklearn.metrics import f1_score
# =======================
# 2. Optuna 목적 함수 정의
# =======================
def DecisionTreeobjective(trial):
    max_depth = trial.suggest_int("max_depth", 2, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, random_state=4)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return f1_score(y_valid, y_pred, average="macro")

def RandomForestobjective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 2, 20)
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return f1_score(y_valid, y_pred, average="macro")

def Logisticobjective(trial):
    C = trial.suggest_loguniform("C", 1e-3, 10)
    model = LogisticRegression(C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return f1_score(y_valid, y_pred, average="macro")

In [5]:
# =======================
# 3. Optuna 실행
# =======================
print("Tuning Decision Tree...")
study_dt = optuna.create_study(direction="maximize")
study_dt.optimize(DecisionTreeobjective, n_trials=50)
print("Best params (DT):", study_dt.best_params)

print("Tuning Random Forest...")
study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(RandomForestobjective, n_trials=50)
print("Best params (RF):", study_rf.best_params)

print("Tuning Logistic Regression...")
study_lr = optuna.create_study(direction="maximize")
study_lr.optimize(Logisticobjective, n_trials=50)
print("Best params (LR):", study_lr.best_params)

[I 2025-09-22 00:18:25,404] A new study created in memory with name: no-name-4dde6c1c-697e-44ab-ad7f-84dd46d6c71e


Tuning Decision Tree...


[I 2025-09-22 00:18:27,426] Trial 0 finished with value: 0.6252012578868519 and parameters: {'max_depth': 15, 'min_samples_split': 12}. Best is trial 0 with value: 0.6252012578868519.
[I 2025-09-22 00:18:29,141] Trial 1 finished with value: 0.5665552036447213 and parameters: {'max_depth': 12, 'min_samples_split': 3}. Best is trial 0 with value: 0.6252012578868519.
[I 2025-09-22 00:18:30,380] Trial 2 finished with value: 0.4125724559533743 and parameters: {'max_depth': 8, 'min_samples_split': 14}. Best is trial 0 with value: 0.6252012578868519.
[I 2025-09-22 00:18:30,892] Trial 3 finished with value: 0.14750676886317626 and parameters: {'max_depth': 3, 'min_samples_split': 4}. Best is trial 0 with value: 0.6252012578868519.
[I 2025-09-22 00:18:33,298] Trial 4 finished with value: 0.6785409408307076 and parameters: {'max_depth': 20, 'min_samples_split': 10}. Best is trial 4 with value: 0.6785409408307076.
[I 2025-09-22 00:18:35,758] Trial 5 finished with value: 0.6235099765052856 and par

Best params (DT): {'max_depth': 20, 'min_samples_split': 8}
Tuning Random Forest...


[I 2025-09-22 00:20:31,720] Trial 0 finished with value: 0.5328657190274627 and parameters: {'n_estimators': 225, 'max_depth': 5}. Best is trial 0 with value: 0.5328657190274627.
[I 2025-09-22 00:21:01,267] Trial 1 finished with value: 0.7166296723004318 and parameters: {'n_estimators': 155, 'max_depth': 14}. Best is trial 1 with value: 0.7166296723004318.
[I 2025-09-22 00:21:04,184] Trial 2 finished with value: 0.3460360887106805 and parameters: {'n_estimators': 70, 'max_depth': 2}. Best is trial 1 with value: 0.7166296723004318.
[I 2025-09-22 00:21:40,096] Trial 3 finished with value: 0.7475746871400928 and parameters: {'n_estimators': 150, 'max_depth': 20}. Best is trial 3 with value: 0.7475746871400928.
[I 2025-09-22 00:22:08,889] Trial 4 finished with value: 0.605967238249265 and parameters: {'n_estimators': 296, 'max_depth': 6}. Best is trial 3 with value: 0.7475746871400928.
[I 2025-09-22 00:22:20,080] Trial 5 finished with value: 0.6092639296278876 and parameters: {'n_estimator

Best params (RF): {'n_estimators': 182, 'max_depth': 20}
Tuning Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2025-09-22 00:51:15,747] Trial 0 finished with value: 0.49787335054875503 and parameters: {'C': 2.2023737503967267}. Best is trial 0 with value: 0.49787335054875503.
  C = trial.suggest_loguniform("C", 1e-3, 10)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2025-09-22 00:52:01,449] Trial 1

Best params (LR): {'C': 5.969780879014203}


In [6]:
# =======================
# 4. 최적 모델 생성
# =======================
dt_best = DecisionTreeClassifier(**study_dt.best_params, random_state=42)
rf_best = RandomForestClassifier(**study_rf.best_params, random_state=42)
lr_best = LogisticRegression(**study_lr.best_params, max_iter=1000, random_state=42)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, f1_score

# =======================
# 5. 스태킹
# =======================

# GridSearchCV 결과에서 최적 모델 뽑기
dt_best = dt_grid.best_estimator_
rf_best = rf_grid.best_estimator_
lr_best = lr_grid.best_estimator_

estimators = [
    ('dt', dt_best),
    ('rf', rf_best)
]

stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=lr_best,
    passthrough=True  # 전방모델 예측결과도 함께 사용
)

# 학습
stack_model.fit(X_train, y_train)

# 검증 예측
y_pred = stack_model.predict(X_valid)

# 성능 출력
print("Accuracy:", accuracy_score(y_valid, y_pred))
print("F1 Score:", f1_score(y_valid, y_pred))

In [7]:
# 위에꺼 사용
# =======================
# 5. 스태킹
# =======================
estimators = [
    ('dt', dt_best),
    ('rf', rf_best)
]

stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=lr_best,
    passthrough=True  # 전방모델 예측결과도 함께 사용
)

stack_model.fit(X_train, y_train)
y_pred = stack_model.predict(X_valid)

print("Accuracy:", accuracy_score(y_valid, y_pred))
print("F1 Score:", f1_score(y_valid, y_pred))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7259737266651302


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [8]:
from sklearn.metrics import f1_score, accuracy_score

print("Accuracy:", accuracy_score(y_valid, y_pred))
print("F1 Score (macro):", f1_score(y_valid, y_pred, average="macro"))
print("F1 Score (weighted):", f1_score(y_valid, y_pred, average="weighted"))

Accuracy: 0.7259737266651302
F1 Score (macro): 0.7354822179055702
F1 Score (weighted): 0.7355854639380877


In [9]:
# =======================
# 6. 최종 예측
# =======================
X_test = test.drop(columns=["ID"])
test_pred = stack_model.predict(X_test)

submission = pd.DataFrame({
    "ID": test["ID"],
    "target": test_pred
})

submission.to_csv("submission.csv", index=False)
print("submission.csv 파일 생성 완료!")

submission.csv 파일 생성 완료!
