### ML Kaggle midterm (Hyper Parameter Tuning)

### KNN ver.

In [1]:
import pandas as pd
import numpy as np
import optuna
import warnings

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore")


In [None]:
# Îç∞Ïù¥ÌÑ∞ Î°úÎìú
train = pd.read_csv(r'C:\Users\user\Desktop\iris-train.csv')
test = pd.read_csv(r'C:\Users\user\Desktop\iris-test.csv')
sample = pd.read_csv(r'C:\Users\user\Desktop\sample_submit.csv')

# ÌïÑÏöîÏóÜÎäî Ïª¨Îüº Ï†úÍ±∞
# X = train.drop(['id', 'species'], axis=1) 

# Ï†ÑÏ≤òÎ¶¨
le = LabelEncoder()
train['species'] = le.fit_transform(train['species'])
X = train.drop(['species'], axis=1)
y = train['species']
X_test = test
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

### 1. Grid Searching Tuning

In [5]:
param_grid = {
    'n_neighbors': list(range(1, 31)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1: Manhattan, 2: Euclidean
}
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

### 2. Optuna Tuning

In [6]:
def objective(trial):
    n_neighbors = trial.suggest_int('n_neighbors', 1, 30)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    p = trial.suggest_categorical('p', [1, 2])

    model = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        p=p
    )
    score = cross_val_score(model, X_scaled, y, cv=5, scoring='accuracy').mean()
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best Parameters:", study.best_params)
print("Best Accuracy:", study.best_value)

[I 2025-04-19 16:19:09,366] A new study created in memory with name: no-name-27f1145b-0495-4bad-85ac-be44f63a060f
[I 2025-04-19 16:19:09,417] Trial 0 finished with value: 0.9142857142857144 and parameters: {'n_neighbors': 22, 'weights': 'uniform', 'p': 1}. Best is trial 0 with value: 0.9142857142857144.
[I 2025-04-19 16:19:09,460] Trial 1 finished with value: 0.9142857142857143 and parameters: {'n_neighbors': 19, 'weights': 'uniform', 'p': 1}. Best is trial 0 with value: 0.9142857142857144.
[I 2025-04-19 16:19:09,496] Trial 2 finished with value: 0.9238095238095239 and parameters: {'n_neighbors': 29, 'weights': 'distance', 'p': 2}. Best is trial 2 with value: 0.9238095238095239.
[I 2025-04-19 16:19:09,544] Trial 3 finished with value: 0.9333333333333333 and parameters: {'n_neighbors': 18, 'weights': 'uniform', 'p': 1}. Best is trial 3 with value: 0.9333333333333333.
[I 2025-04-19 16:19:09,598] Trial 4 finished with value: 0.9047619047619048 and parameters: {'n_neighbors': 28, 'weights'

Best Parameters: {'n_neighbors': 8, 'weights': 'uniform', 'p': 2}
Best Accuracy: 0.9619047619047618


### 3. Random Search Tuning

In [None]:
# ÌååÎùºÎØ∏ÌÑ∞ ÏÑ§Ï†ï
param_dist = {
    'n_neighbors': list(range(1, 31)),             # Ïù¥ÏõÉ Ïàò (1~30)
    'weights': ['uniform', 'distance'],            # Í±∞Î¶¨ Í∞ÄÏ§ëÏπò Î∞©Ïãù
    'p': [1, 2]                                     # Í±∞Î¶¨ Í∏∞Ï§Ä: 1=Îß®Ìï¥Ìäº, 2=Ïú†ÌÅ¥Î¶¨Îìú
}

# Ïã§Ìñâ
random_search = RandomizedSearchCV(
    estimator=knn,
    param_distributions=param_dist,
    n_iter=50,                   # Ï¥ù 50Ìöå ÎûúÎç§ Ï°∞Ìï© ÏãúÎèÑ
    cv=5,                        # 5-fold ÍµêÏ∞®Í≤ÄÏ¶ù
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

# Ï∂úÎ†•
print(" ÏµúÏ†Å ÌååÎùºÎØ∏ÌÑ∞ (KNN Random Search):", random_search.best_params_)
print(" CV Ï†ïÌôïÎèÑ:", random_search.best_score_)


‚úÖ ÏµúÏ†Å ÌååÎùºÎØ∏ÌÑ∞ (KNN Random Search): {'weights': 'distance', 'p': 1, 'n_neighbors': 9}
‚úÖ CV Ï†ïÌôïÎèÑ: 0.9882352941176471


### 4. Compare Results

In [None]:
# 1. GridSearchCV Í≤∞Í≥º
grid_best_model = grid.best_estimator_
y_val_pred_grid = grid_best_model.predict(X_val)
val_acc_grid = accuracy_score(y_val, y_val_pred_grid)

# 2. Optuna Í≤∞Í≥º
optuna_best_params = study.best_params
optuna_best_model = KNeighborsClassifier(
    n_neighbors=optuna_best_params['n_neighbors'],
    weights=optuna_best_params['weights'],
    p=optuna_best_params['p']
)
optuna_best_model.fit(X_train, y_train)
y_val_pred_optuna = optuna_best_model.predict(X_val)
val_acc_optuna = accuracy_score(y_val, y_val_pred_optuna)

# 3. RandomizedSearchCV Í≤∞Í≥º
random_best_model = random_search.best_estimator_
y_val_pred_random = random_best_model.predict(X_val)
val_acc_random = accuracy_score(y_val, y_val_pred_random)

# ‚úÖ Í≤∞Í≥º ÎπÑÍµê Ï∂úÎ†•
print("üìä KNN ÌäúÎãù Í≤∞Í≥º ÎπÑÍµê")
print("-" * 50)
print(f" GridSearchCV")
print(f"  - Best Params: {grid.best_params_}")
print(f"  - CV Accuracy: {grid.best_score_:.6f}")
print(f"  - Validation Accuracy: {val_acc_grid:.6f}")
print("-" * 50)
print(f" Optuna")
print(f"  - Best Params: {study.best_params}")
print(f"  - CV Accuracy: {study.best_value:.6f}")
print(f"  - Validation Accuracy: {val_acc_optuna:.6f}")
print("-" * 50)
print(f" RandomizedSearchCV")
print(f"  - Best Params: {random_search.best_params_}")
print(f"  - CV Accuracy: {random_search.best_score_:.6f}")
print(f"  - Validation Accuracy: {val_acc_random:.6f}")
print("-" * 50)

print("GridSearchCV Accuracy:", accuracy_score(y_val, grid.best_estimator_.predict(X_val)))
print("Optuna Accuracy:", accuracy_score(y_val, optuna_best_model.predict(X_val)))
print("RandomizedSearchCV Accuracy:", accuracy_score(y_val, random_search.best_estimator_.predict(X_val)))


üìä KNN ÌäúÎãù Í≤∞Í≥º ÎπÑÍµê
--------------------------------------------------
üîç GridSearchCV
  - Best Params: {'n_neighbors': 8, 'p': 1, 'weights': 'distance'}
  - CV Accuracy: 0.988235
  - Validation Accuracy: 1.000000
--------------------------------------------------
üéØ Optuna
  - Best Params: {'n_neighbors': 8, 'weights': 'uniform', 'p': 2}
  - CV Accuracy: 0.961905
  - Validation Accuracy: 0.904762
--------------------------------------------------
üé≤ RandomizedSearchCV
  - Best Params: {'weights': 'distance', 'p': 1, 'n_neighbors': 9}
  - CV Accuracy: 0.988235
  - Validation Accuracy: 0.904762
--------------------------------------------------
GridSearchCV Accuracy: 1.0
Optuna Accuracy: 0.9047619047619048
RandomizedSearchCV Accuracy: 0.9047619047619048


### 5. testing & submit code

In [14]:
# üîß ÏÑ†ÌÉùÌïú ÏµúÏ†Å Î™®Îç∏ ÏÇ¨Ïö© (Ïó¨Í∏∞ÏÑ† GridSearchCV Í≤∞Í≥º ÏÇ¨Ïö©)
final_model = grid.best_estimator_

# Ï†ÑÏ≤¥ ÌõàÎ†® Îç∞Ïù¥ÌÑ∞Î°ú Ïû¨ÌïôÏäµ (val ÎÇòÎàÑÏßÄ ÏïäÍ≥† Ï†ÑÏ≤¥ ÏÇ¨Ïö©)
final_model.fit(X_scaled, y)

# ÌÖåÏä§Ìä∏ÏÖã ÏòàÏ∏°
y_test_pred = final_model.predict(X_test_scaled)
y_test_pred_labels = le.inverse_transform(y_test_pred)  # Ïà´Ïûê ‚Üí ÏõêÎûò ÌÅ¥ÎûòÏä§ Ïù¥Î¶Ñ

# Ï†úÏ∂ú ÌååÏùº ÏÉùÏÑ±
submission = sample.copy()
submission['species'] = y_test_pred_labels
submission.to_csv('iris_final_bestmodel_submission.csv', index=False)

print("ÏµúÏ¢Ö Ï†úÏ∂úÌååÏùº ÏÉùÏÑ± ÏôÑÎ£å: iris_final_bestmodel_submission.csv")


ÏµúÏ¢Ö Ï†úÏ∂úÌååÏùº ÏÉùÏÑ± ÏôÑÎ£å: iris_final_bestmodel_submission.csv
