### ML Kaggle midterm Hyper Parameter Tuning (Logistic Regression Ver.)

In [None]:
import pandas as pd
import numpy as np
import optuna
import warnings

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

warnings.filterwarnings("ignore")

In [28]:
# 데이터 로드
train = pd.read_csv(r'C:\Users\user\Desktop\iris-train.csv')
test = pd.read_csv(r'C:\Users\user\Desktop\iris-test.csv')
sample = pd.read_csv(r'C:\Users\user\Desktop\sample_submit.csv')

# 전처리
le = LabelEncoder()
train['species'] = le.fit_transform(train['species'])
X = train.drop(['species'], axis=1)
y = train['species']
X_test = test
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

### 1. Grid Searching Tuning

In [29]:
# 2. GridSearchCV 설정
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],  # saga는 l1/l2 모두 지원
    'multi_class': ['ovr', 'multinomial']
}
logreg = LogisticRegression(max_iter=1000)
grid = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

### 2. Optuna Tuning

In [30]:
# 2. Optuna objective 함수 정의
def objective(trial):
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    C = trial.suggest_loguniform('C', 1e-3, 1e2)
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    multi_class = trial.suggest_categorical('multi_class', ['ovr', 'multinomial'])

    # 일부 조합은 지원되지 않음 → skip 처리
    if penalty == 'l1' and solver not in ['liblinear', 'saga']:
        raise optuna.exceptions.TrialPruned()
    if penalty == 'l2' and solver not in ['liblinear', 'saga']:
        raise optuna.exceptions.TrialPruned()
    if solver == 'liblinear' and multi_class == 'multinomial':
        raise optuna.exceptions.TrialPruned()

    model = LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        multi_class=multi_class,
        max_iter=1000
    )
    score = cross_val_score(model, X_scaled, y, cv=5, scoring='accuracy').mean()
    return score

# 3. Optuna 스터디 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# 4. 결과 출력
print("Best Parameters:", study.best_params)
print("Best Accuracy:", study.best_value)

[I 2025-04-19 16:05:13,513] A new study created in memory with name: no-name-4b208f6c-83b4-4fb4-b0fb-70192ce880ab
[I 2025-04-19 16:05:13,577] Trial 0 finished with value: 0.3333333333333333 and parameters: {'penalty': 'l1', 'C': 0.016666269065422117, 'solver': 'liblinear', 'multi_class': 'ovr'}. Best is trial 0 with value: 0.3333333333333333.
[I 2025-04-19 16:05:13,710] Trial 1 finished with value: 0.3333333333333333 and parameters: {'penalty': 'l1', 'C': 0.004546191557819325, 'solver': 'liblinear', 'multi_class': 'ovr'}. Best is trial 0 with value: 0.3333333333333333.
[I 2025-04-19 16:05:13,820] Trial 2 finished with value: 0.9142857142857143 and parameters: {'penalty': 'l2', 'C': 2.5172039216990014, 'solver': 'saga', 'multi_class': 'ovr'}. Best is trial 2 with value: 0.9142857142857143.
[I 2025-04-19 16:05:13,879] Trial 3 finished with value: 0.8380952380952381 and parameters: {'penalty': 'l2', 'C': 0.009198320195100553, 'solver': 'liblinear', 'multi_class': 'ovr'}. Best is trial 2 w

Best Parameters: {'penalty': 'l1', 'C': 1.0388711637834593, 'solver': 'saga', 'multi_class': 'multinomial'}
Best Accuracy: 0.9523809523809523


### 3. Random Search Tuning

In [31]:
# 4. 랜덤 서치 파라미터 설정
param_dist = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-3, 2, 100),  # 0.001 ~ 100 사이 로그스케일
    'solver': ['liblinear', 'saga'],
    'multi_class': ['ovr', 'multinomial']
}

# 5. 랜덤 서치 실행
model = LogisticRegression(max_iter=1000)
random_search = RandomizedSearchCV(model, param_distributions=param_dist, 
                                   n_iter=50, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)


### 4. Compare Results

In [34]:
from sklearn.metrics import accuracy_score

# 1. GridSearchCV 결과
grid_best_model = grid.best_estimator_
y_val_pred_grid = grid_best_model.predict(X_val)
val_acc_grid = accuracy_score(y_val, y_val_pred_grid)

# 2. Optuna 결과
optuna_best_params = study.best_params
optuna_best_model = LogisticRegression(
    penalty=optuna_best_params['penalty'],
    C=optuna_best_params['C'],
    solver=optuna_best_params['solver'],
    multi_class=optuna_best_params['multi_class'],
    max_iter=1000
)
optuna_best_model.fit(X_train, y_train)
y_val_pred_optuna = optuna_best_model.predict(X_val)
val_acc_optuna = accuracy_score(y_val, y_val_pred_optuna)

# 3. RandomizedSearchCV 결과
random_best_model = random_search.best_estimator_
y_val_pred_random = random_best_model.predict(X_val)
val_acc_random = accuracy_score(y_val, y_val_pred_random)

# ✅ 결과 비교 출력
print("📊 튜닝 결과 비교")
print("-" * 50)
print(f"🔍 GridSearchCV")
print(f"  - Best Params: {grid.best_params_}")
print(f"  - CV Accuracy: {grid.best_score_:.6f}")
print(f"  - Validation Accuracy: {val_acc_grid:.6f}")
print("-" * 50)
print(f"🎯 Optuna")
print(f"  - Best Params: {study.best_params}")
print(f"  - CV Accuracy: {study.best_value:.6f}")
print(f"  - Validation Accuracy: {val_acc_optuna:.6f}")
print("-" * 50)
print(f"🎲 RandomizedSearchCV")
print(f"  - Best Params: {random_search.best_params_}")
print(f"  - CV Accuracy: {random_search.best_score_:.6f}")
print(f"  - Validation Accuracy: {val_acc_random:.6f}")
print("-" * 50)

print("GridSearchCV Accuracy:", accuracy_score(y_val, grid.best_estimator_.predict(X_val)))
print("Optuna Accuracy:", accuracy_score(y_val, optuna_best_model.predict(X_val)))
print("RandomizedSearchCV Accuracy:", accuracy_score(y_val, random_search.best_estimator_.predict(X_val)))



📊 튜닝 결과 비교
--------------------------------------------------
🔍 GridSearchCV
  - Best Params: {'C': 10, 'multi_class': 'multinomial', 'penalty': 'l1', 'solver': 'saga'}
  - CV Accuracy: 0.988235
  - Validation Accuracy: 1.000000
--------------------------------------------------
🎯 Optuna
  - Best Params: {'penalty': 'l1', 'C': 1.0388711637834593, 'solver': 'saga', 'multi_class': 'multinomial'}
  - CV Accuracy: 0.952381
  - Validation Accuracy: 0.904762
--------------------------------------------------
🎲 RandomizedSearchCV
  - Best Params: {'solver': 'saga', 'penalty': 'l1', 'multi_class': 'multinomial', 'C': 2.1544346900318843}
  - CV Accuracy: 0.988235
  - Validation Accuracy: 0.904762
--------------------------------------------------
GridSearchCV Accuracy: 1.0
Optuna Accuracy: 0.9047619047619048
RandomizedSearchCV Accuracy: 0.9047619047619048


### 5. testing & submit code

In [None]:
# 🔧 선택한 최적 모델 사용 (여기선 GridSearchCV 결과 사용)
final_model = grid.best_estimator_

# 전체 훈련 데이터로 재학습 (val 나누지 않고 전체 사용)
final_model.fit(X_scaled, y)

# 테스트셋 예측
y_test_pred = final_model.predict(X_test_scaled)
y_test_pred_labels = le.inverse_transform(y_test_pred)  # 숫자 → 원래 클래스 이름

# 제출 파일 생성
submission = sample.copy()
submission['species'] = y_test_pred_labels
submission.to_csv('iris_final_bestmodel_submission.csv', index=False)

print("최종 제출파일 생성 완료: iris_final_bestmodel_submission.csv")


✅ 최종 제출파일 생성 완료: iris_final_bestmodel_submission.csv
