In [1]:
!pip install optuna scikit-learn

import pandas as pd
import optuna
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 讀取資料集
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# 特徵工程
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 模型選擇：訓練並比較不同模型
rf_model = RandomForestClassifier(random_state=42)
svm_model = SVC(random_state=42)
lr_model = LogisticRegression(random_state=42)

models = [rf_model, svm_model, lr_model]
model_names = ["Random Forest", "SVM", "Logistic Regression"]

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"Model: {name}, Accuracy: {accuracy:.4f}")

# Optuna 超參數優化
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42
    )

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print("Best parameters: ", study.best_params)
print("Best accuracy: ", study.best_value)

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2024-12-11 13:13:09,643] A new study created in memory with name: no-name-0e78a303-c5d2-4016-a580-f8345a0d2af2


Model: Random Forest, Accuracy: 0.9649
Model: SVM, Accuracy: 0.9474
Model: Logistic Regression, Accuracy: 0.9561


[I 2024-12-11 13:13:10,033] Trial 0 finished with value: 0.9649122807017544 and parameters: {'n_estimators': 130, 'max_depth': 10, 'min_samples_split': 10}. Best is trial 0 with value: 0.9649122807017544.
[I 2024-12-11 13:13:10,317] Trial 1 finished with value: 0.9649122807017544 and parameters: {'n_estimators': 113, 'max_depth': 8, 'min_samples_split': 6}. Best is trial 0 with value: 0.9649122807017544.
[I 2024-12-11 13:13:10,480] Trial 2 finished with value: 0.956140350877193 and parameters: {'n_estimators': 62, 'max_depth': 9, 'min_samples_split': 5}. Best is trial 0 with value: 0.9649122807017544.
[I 2024-12-11 13:13:10,873] Trial 3 finished with value: 0.9649122807017544 and parameters: {'n_estimators': 153, 'max_depth': 10, 'min_samples_split': 5}. Best is trial 0 with value: 0.9649122807017544.
[I 2024-12-11 13:13:11,057] Trial 4 finished with value: 0.9649122807017544 and parameters: {'n_estimators': 72, 'max_depth': 8, 'min_samples_split': 10}. Best is trial 0 with value: 0.96

Best parameters:  {'n_estimators': 130, 'max_depth': 10, 'min_samples_split': 10}
Best accuracy:  0.9649122807017544
